bitkeeper revision 1.1108.47.1 (410d0894HhCNQH1pLGY2q0pUmuKCfQ)
author cl349@freefall.cl.cam.ac.uk <cl349@freefall.cl.cam.ac.uk>
Sun, 1 Aug 2004 15:13:24 +0000 (15:13 +0000)
committer cl349@freefall.cl.cam.ac.uk <cl349@freefall.cl.cam.ac.uk>
Sun, 1 Aug 2004 15:13:24 +0000 (15:13 +0000)
add block backend driver for Linux 2.6

21 files changed:
.rootkeys
linux-2.6.7-xen-sparse/arch/xen/Kconfig
linux-2.6.7-xen-sparse/arch/xen/configs/xen0_defconfig
linux-2.6.7-xen-sparse/arch/xen/configs/xenU_defconfig
linux-2.6.7-xen-sparse/drivers/xen/Makefile
linux-2.6.7-xen-sparse/drivers/xen/blkback/Makefile [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/blkback/blkback.c [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/blkback/common.h [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/blkback/control.c [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/blkback/interface.c [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/blkback/vbd.c [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/blkfront/Kconfig [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/blkfront/Makefile [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/blkfront/blkfront.c [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/blkfront/block.h [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/blkfront/vbd.c [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/block/Kconfig [deleted file]
linux-2.6.7-xen-sparse/drivers/xen/block/Makefile [deleted file]
linux-2.6.7-xen-sparse/drivers/xen/block/block.c [deleted file]
linux-2.6.7-xen-sparse/drivers/xen/block/block.h [deleted file]
linux-2.6.7-xen-sparse/drivers/xen/block/vbd.c [deleted file]

index f38eb42f17cc1103b3318937c12ecbc0b555ae60..ac0845c34f5502a5a7119544909c59fccbbf00ab 100644 (file)
--- a/.rootkeys
+++ b/.rootkeys
 3f68905c5eiA-lBMQSvXLMWS1ikDEA linux-2.6.7-xen-sparse/arch/xen/kernel/xen_proc.c
 4108f5c1WfTIrs0HZFeV39sttekCTw linux-2.6.7-xen-sparse/drivers/char/mem.c
 40f56239Dp_vMTgz8TEbvo1hjHGc3w linux-2.6.7-xen-sparse/drivers/xen/Makefile
-40f56239Sfle6wGv5FS0wjS_HI150A linux-2.6.7-xen-sparse/drivers/xen/block/Kconfig
-40f562395atl9x4suKGhPkjqLOXESg linux-2.6.7-xen-sparse/drivers/xen/block/Makefile
-40f56239-JNIaTzlviVJohVdoYOUpw linux-2.6.7-xen-sparse/drivers/xen/block/block.c
-40f56239y9naBTXe40Pi2J_z3p-d1g linux-2.6.7-xen-sparse/drivers/xen/block/block.h
-40f56239BVfPsXBiWQitXgDRtOsiqg linux-2.6.7-xen-sparse/drivers/xen/block/vbd.c
+410d0893otFGghmv4dUXDUBBdY5aIA linux-2.6.7-xen-sparse/drivers/xen/blkback/Makefile
+4087cf0d1XgMkooTZAiJS6NrcpLQNQ linux-2.6.7-xen-sparse/drivers/xen/blkback/blkback.c
+4087cf0dZadZ8r6CEt4fNN350Yle3A linux-2.6.7-xen-sparse/drivers/xen/blkback/common.h
+4087cf0dxlh29iw0w-9rxOCEGCjPcw linux-2.6.7-xen-sparse/drivers/xen/blkback/control.c
+4087cf0dbuoH20fMjNZjcgrRK-1msQ linux-2.6.7-xen-sparse/drivers/xen/blkback/interface.c
+4087cf0dk97tacDzxfByWV7JifUYqA linux-2.6.7-xen-sparse/drivers/xen/blkback/vbd.c
+40f56239Sfle6wGv5FS0wjS_HI150A linux-2.6.7-xen-sparse/drivers/xen/blkfront/Kconfig
+40f562395atl9x4suKGhPkjqLOXESg linux-2.6.7-xen-sparse/drivers/xen/blkfront/Makefile
+40f56239-JNIaTzlviVJohVdoYOUpw linux-2.6.7-xen-sparse/drivers/xen/blkfront/blkfront.c
+40f56239y9naBTXe40Pi2J_z3p-d1g linux-2.6.7-xen-sparse/drivers/xen/blkfront/block.h
+40f56239BVfPsXBiWQitXgDRtOsiqg linux-2.6.7-xen-sparse/drivers/xen/blkfront/vbd.c
 40f56239fsLjvtD8YBRAWphps4FDjg linux-2.6.7-xen-sparse/drivers/xen/console/Makefile
 3e5a4e651TH-SXHoufurnWjgl5bfOA linux-2.6.7-xen-sparse/drivers/xen/console/console.c
 40f56239KYxO0YabhPzCTeUuln-lnA linux-2.6.7-xen-sparse/drivers/xen/evtchn/Makefile
index e4db83fee3dbc158b051006fde486ea70dca4d85..c1eb03cda17327be6cd2f6156116663f761355d6 100644 (file)
@@ -38,6 +38,11 @@ config XEN_PHYSDEV_ACCESS
 
 endmenu
 
+# Xen's block device backend driver needs 2^12 pages
+config FORCE_MAX_ZONEORDER
+        int
+        default "12" if XEN_PHYSDEV_ACCESS
+        default "11" if !XEN_PHYSDEV_ACCESS
 
 #config VT
 #      bool
index d82b0afec504127b0c62102a63861fe850114d07..876d09312618caef754f8ba67cffd2f3f7cdbfd4 100644 (file)
@@ -10,6 +10,7 @@ CONFIG_NO_IDLE_HZ=y
 #
 CONFIG_XEN_PRIVILEGED_GUEST=y
 CONFIG_XEN_PHYSDEV_ACCESS=y
+CONFIG_FORCE_MAX_ZONEORDER=12
 CONFIG_X86=y
 # CONFIG_X86_64 is not set
 
index a56fc3f7d4f0b87ef28cc99084dead68a407330b..fb550771994909197b56c556be9aa303177db404 100644 (file)
@@ -10,6 +10,7 @@ CONFIG_NO_IDLE_HZ=y
 #
 # CONFIG_XEN_PRIVILEGED_GUEST is not set
 # CONFIG_XEN_PHYSDEV_ACCESS is not set
+CONFIG_FORCE_MAX_ZONEORDER=11
 CONFIG_X86=y
 # CONFIG_X86_64 is not set
 
index ae06ee79a7f72d791d4b782f7cf4260c5b29bcdc..f002a933344ba7576930f6f79874786eb6cbf6bb 100644 (file)
@@ -1,9 +1,11 @@
 
 
-obj-y  += block/
+obj-y  += blkfront/
 obj-y  += console/
 obj-y  += evtchn/
 obj-y  += netfront/
 obj-y  += privcmd/
 
+obj-$(CONFIG_XEN_PHYSDEV_ACCESS)       += blkback/
 obj-$(CONFIG_XEN_PHYSDEV_ACCESS)       += netback/
+
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/blkback/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/blkback/Makefile
new file mode 100644 (file)
index 0000000..a27fe65
--- /dev/null
@@ -0,0 +1,2 @@
+
+obj-y  := blkback.o control.o interface.o vbd.o
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/blkback/blkback.c b/linux-2.6.7-xen-sparse/drivers/xen/blkback/blkback.c
new file mode 100644 (file)
index 0000000..f5a688a
--- /dev/null
@@ -0,0 +1,588 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/main.c
+ * 
+ * Back-end of the driver for virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A 
+ * reference front-end implementation can be found in:
+ *  arch/xen/drivers/blkif/frontend
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ */
+
+#include "common.h"
+
+/*
+ * These are rather arbitrary. They are fairly large because adjacent requests
+ * pulled from a communication ring are quite likely to end up being part of
+ * the same scatter/gather request at the disc.
+ * 
+ * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
+ * This will increase the chances of being able to write whole tracks.
+ * 64 should be enough to keep us competitive with Linux.
+ */
+#define MAX_PENDING_REQS 64
+#define BATCH_PER_DOMAIN 16
+
+/*
+ * NB. We place a page of padding between each buffer page to avoid incorrect
+ * merging of requests by the IDE and SCSI merging routines. Otherwise, two
+ * adjacent buffers in a scatter-gather request would have adjacent page
+ * numbers: since the merge routines don't realise that this is in *pseudophys*
+ * space, not real space, they may collapse the s-g elements!
+ */
+static unsigned long mmap_vstart;
+#define MMAP_PAGES_PER_REQUEST \
+    (2 * (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1))
+#define MMAP_PAGES             \
+    (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
+#define MMAP_VADDR(_req,_seg)                        \
+    (mmap_vstart +                                   \
+     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
+     ((_seg) * 2 * PAGE_SIZE))
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a 
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements 
+ * the pendcnt towards zero. When it hits zero, the specified domain has a 
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+    blkif_t       *blkif;
+    unsigned long  id;
+    int            nr_pages;
+    atomic_t       pendcnt;
+    unsigned short operation;
+    int            status;
+} pending_req_t;
+
+/*
+ * We can't allocate pending_req's in order, since they may complete out of 
+ * order. We therefore maintain an allocation ring. This ring also indicates 
+ * when enough work has been passed down -- at that point the allocation ring 
+ * will be empty.
+ */
+static pending_req_t pending_reqs[MAX_PENDING_REQS];
+static unsigned char pending_ring[MAX_PENDING_REQS];
+static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
+/* NB. We use a different index type to differentiate from shared blk rings. */
+typedef unsigned int PEND_RING_IDX;
+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
+static PEND_RING_IDX pending_prod, pending_cons;
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
+
+#if 0
+static kmem_cache_t *buffer_head_cachep;
+#endif
+
+static int do_block_io_op(blkif_t *blkif, int max_to_do);
+static void dispatch_probe(blkif_t *blkif, blkif_request_t *req);
+static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static void make_response(blkif_t *blkif, unsigned long id, 
+                          unsigned short op, int st);
+
+static void fast_flush_area(int idx, int nr_pages)
+{
+    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];
+    int               i;
+
+    for ( i = 0; i < nr_pages; i++ )
+    {
+        mcl[i].op = __HYPERVISOR_update_va_mapping;
+        mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT;
+        mcl[i].args[1] = 0;
+        mcl[i].args[2] = 0;
+    }
+
+    mcl[nr_pages-1].args[2] = UVMF_FLUSH_TLB;
+    if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
+        BUG();
+}
+
+
+/******************************************************************
+ * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
+ */
+
+static struct list_head blkio_schedule_list;
+static spinlock_t blkio_schedule_list_lock;
+
+static int __on_blkdev_list(blkif_t *blkif)
+{
+    return blkif->blkdev_list.next != NULL;
+}
+
+static void remove_from_blkdev_list(blkif_t *blkif)
+{
+    unsigned long flags;
+    if ( !__on_blkdev_list(blkif) ) return;
+    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
+    if ( __on_blkdev_list(blkif) )
+    {
+        list_del(&blkif->blkdev_list);
+        blkif->blkdev_list.next = NULL;
+        blkif_put(blkif);
+    }
+    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
+}
+
+static void add_to_blkdev_list_tail(blkif_t *blkif)
+{
+    unsigned long flags;
+    if ( __on_blkdev_list(blkif) ) return;
+    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
+    if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
+    {
+        list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
+        blkif_get(blkif);
+    }
+    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
+}
+
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
+
+static int blkio_schedule(void *arg)
+{
+    DECLARE_WAITQUEUE(wq, current);
+
+    blkif_t          *blkif;
+    struct list_head *ent;
+
+    for ( ; ; )
+    {
+        /* Wait for work to do. */
+        add_wait_queue(&blkio_schedule_wait, &wq);
+        set_current_state(TASK_INTERRUPTIBLE);
+        if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || 
+             list_empty(&blkio_schedule_list) )
+            schedule();
+        __set_current_state(TASK_RUNNING);
+        remove_wait_queue(&blkio_schedule_wait, &wq);
+
+        /* Queue up a batch of requests. */
+        while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
+                !list_empty(&blkio_schedule_list) )
+        {
+            ent = blkio_schedule_list.next;
+            blkif = list_entry(ent, blkif_t, blkdev_list);
+            blkif_get(blkif);
+            remove_from_blkdev_list(blkif);
+            if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
+                add_to_blkdev_list_tail(blkif);
+            blkif_put(blkif);
+        }
+        
+#if 0                          /* XXXcl tq */
+        /* Push the batch through to disc. */
+        run_task_queue(&tq_disk);
+#endif
+    }
+}
+
+static void maybe_trigger_blkio_schedule(void)
+{
+    /*
+     * Needed so that two processes, who together make the following predicate
+     * true, don't both read stale values and evaluate the predicate
+     * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
+     */
+    smp_mb();
+
+    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
+         !list_empty(&blkio_schedule_list) )
+        wake_up(&blkio_schedule_wait);
+}
+
+
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called as bh->b_end_io()
+ */
+
+static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
+{
+    unsigned long flags;
+
+    /* An error fails the entire request. */
+    if ( !uptodate )
+    {
+        DPRINTK("Buffer not up-to-date at end of operation\n");
+        pending_req->status = BLKIF_RSP_ERROR;
+    }
+
+    if ( atomic_dec_and_test(&pending_req->pendcnt) )
+    {
+        int pending_idx = pending_req - pending_reqs;
+        fast_flush_area(pending_idx, pending_req->nr_pages);
+        make_response(pending_req->blkif, pending_req->id,
+                      pending_req->operation, pending_req->status);
+        blkif_put(pending_req->blkif);
+        spin_lock_irqsave(&pend_prod_lock, flags);
+        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+        spin_unlock_irqrestore(&pend_prod_lock, flags);
+        maybe_trigger_blkio_schedule();
+    }
+}
+
+static int end_block_io_op(struct bio *bio, unsigned int done, int error)
+{
+    if (done || error)         /* XXXcl */
+       __end_block_io_op(bio->bi_private, done);
+#if 0
+    kmem_cache_free(buffer_head_cachep, bh);
+#else
+    bio_put(bio);
+#endif
+    return error;
+}
+
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+    blkif_t *blkif = dev_id;
+    add_to_blkdev_list_tail(blkif);
+    maybe_trigger_blkio_schedule();
+    return IRQ_HANDLED;
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+
+static int do_block_io_op(blkif_t *blkif, int max_to_do)
+{
+    blkif_ring_t *blk_ring = blkif->blk_ring_base;
+    blkif_request_t *req;
+    BLKIF_RING_IDX i;
+    int more_to_do = 0;
+
+    /* Take items off the comms ring, taking care not to overflow. */
+    for ( i = blkif->blk_req_cons; 
+          (i != blk_ring->req_prod) && ((i-blkif->blk_resp_prod) != 
+                                        BLKIF_RING_SIZE);
+          i++ )
+    {
+        if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
+        {
+            more_to_do = 1;
+            break;
+        }
+        
+        req = &blk_ring->ring[MASK_BLKIF_IDX(i)].req;
+        switch ( req->operation )
+        {
+        case BLKIF_OP_READ:
+        case BLKIF_OP_WRITE:
+            dispatch_rw_block_io(blkif, req);
+            break;
+
+        case BLKIF_OP_PROBE:
+            dispatch_probe(blkif, req);
+            break;
+
+        default:
+            DPRINTK("error: unknown block io operation [%d]\n",
+                    blk_ring->ring[i].req.operation);
+            make_response(blkif, blk_ring->ring[i].req.id, 
+                          blk_ring->ring[i].req.operation, BLKIF_RSP_ERROR);
+            break;
+        }
+    }
+
+    blkif->blk_req_cons = i;
+    return more_to_do;
+}
+
+static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
+{
+    int rsp = BLKIF_RSP_ERROR;
+    int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+
+    /* We expect one buffer only. */
+    if ( unlikely(req->nr_segments != 1) )
+        goto out;
+
+    /* Make sure the buffer is page-sized. */
+    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
+         (blkif_last_sect(req->frame_and_sects[0]) != 7) )
+        goto out;
+
+    if ( HYPERVISOR_update_va_mapping_otherdomain(
+        MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
+        (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
+        0, blkif->domid) )
+        goto out;
+
+    rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0), 
+                    PAGE_SIZE / sizeof(vdisk_t));
+
+ out:
+    fast_flush_area(pending_idx, 1);
+    make_response(blkif, req->id, req->operation, rsp);
+}
+
+static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
+{
+    extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
+#if 0
+    struct buffer_head *bh;
+#else
+    struct bio *bio;
+#endif
+    int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
+    short nr_sects;
+    unsigned long buffer, fas;
+    int i, j, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+    pending_req_t *pending_req;
+    unsigned long  remap_prot;
+    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];
+
+    /* We map virtual scatter/gather segments to physical segments. */
+    int new_segs, nr_psegs = 0;
+    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1];
+
+    /* Check that number of segments is sane. */
+    if ( unlikely(req->nr_segments == 0) || 
+         unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
+    {
+        DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
+        goto bad_descriptor;
+    }
+
+    /*
+     * Check each address/size pair is sane, and convert into a
+     * physical device and block offset. Note that if the offset and size
+     * crosses a virtual extent boundary, we may end up with more
+     * physical scatter/gather segments than virtual segments.
+     */
+    for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
+    {
+        fas      = req->frame_and_sects[i];
+        buffer   = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
+        nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
+
+        if ( nr_sects <= 0 )
+            goto bad_descriptor;
+
+        phys_seg[nr_psegs].ps_device     = req->device;
+        phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
+        phys_seg[nr_psegs].buffer        = buffer;
+        phys_seg[nr_psegs].nr_sects      = nr_sects;
+
+        /* Translate the request into the relevant 'physical device' */
+        new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation);
+        if ( new_segs < 0 )
+        { 
+            DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", 
+                    operation == READ ? "read" : "write", 
+                    req->sector_number + tot_sects, 
+                    req->sector_number + tot_sects + nr_sects, 
+                    req->device); 
+            goto bad_descriptor;
+        }
+  
+        nr_psegs += new_segs;
+        ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1));
+    }
+
+    /* Nonsensical zero-sized request? */
+    if ( unlikely(nr_psegs == 0) )
+        goto bad_descriptor;
+
+    if ( operation == READ )
+        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
+    else
+        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED;
+
+    for ( i = 0; i < nr_psegs; i++ )
+    {
+        mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
+        mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
+        mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot;
+        mcl[i].args[2] = 0;
+        mcl[i].args[3] = blkif->domid;
+
+        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
+            phys_seg[i].buffer >> PAGE_SHIFT;
+    }
+
+    if ( unlikely(HYPERVISOR_multicall(mcl, nr_psegs) != 0) )
+        BUG();
+
+    for ( i = 0; i < nr_psegs; i++ )
+    {
+        if ( unlikely(mcl[i].args[5] != 0) )
+        {
+            DPRINTK("invalid buffer -- could not remap it\n");
+            fast_flush_area(pending_idx, nr_psegs);
+            goto bad_descriptor;
+        }
+    }
+
+    pending_req = &pending_reqs[pending_idx];
+    pending_req->blkif     = blkif;
+    pending_req->id        = req->id;
+    pending_req->operation = operation;
+    pending_req->status    = BLKIF_RSP_OKAY;
+    pending_req->nr_pages  = nr_psegs;
+    atomic_set(&pending_req->pendcnt, nr_psegs);
+    pending_cons++;
+
+    blkif_get(blkif);
+
+    /* Now we pass each segment down to the real blkdev layer. */
+#if 0
+    for ( i = 0; i < nr_psegs; i++ )
+    {
+        bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC);
+        if ( unlikely(bh == NULL) )
+        {
+            __end_block_io_op(pending_req, 0);
+            continue;          /* XXXcl continue!? */
+        }
+        memset(bh, 0, sizeof (struct buffer_head));
+
+        init_waitqueue_head(&bh->b_wait);
+        bh->b_size          = phys_seg[i].nr_sects << 9;
+        bh->b_dev           = phys_seg[i].dev;
+        bh->b_rdev          = phys_seg[i].dev;
+        bh->b_rsector       = (unsigned long)phys_seg[i].sector_number;
+        bh->b_data          = (char *)MMAP_VADDR(pending_idx, i) +
+            (phys_seg[i].buffer & ~PAGE_MASK);
+        bh->b_page          = virt_to_page(MMAP_VADDR(pending_idx, i));
+        bh->b_end_io        = end_block_io_op;
+        bh->b_private       = pending_req;
+
+        bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | 
+            (1 << BH_Req) | (1 << BH_Launder);
+        if ( operation == WRITE )
+            bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate);
+
+        atomic_set(&bh->b_count, 1);
+
+        /* Dispatch a single request. We'll flush it to disc later. */
+        generic_make_request(operation, bh);
+    }
+#else
+    for ( i = 0; i < nr_psegs; i++ )
+    {
+       int nr_iovecs = PFN_UP(phys_seg[i].nr_sects << 9);
+       ASSERT(nr_iovecs == 1);
+       bio = bio_alloc(GFP_ATOMIC, nr_iovecs);
+       if ( unlikely(bio == NULL) )
+       {
+           __end_block_io_op(pending_req, 0);
+           break;
+       }
+       bio->bi_bdev = phys_seg[i].ps_bdev;
+       bio->bi_private = pending_req;
+       bio->bi_end_io = end_block_io_op;
+       bio->bi_sector = phys_seg[i].sector_number;
+       bio->bi_rw = operation;
+
+       bio->bi_size = 0;
+
+       for ( j = 0; j < nr_iovecs; j++ )
+       {
+           struct bio_vec *bv = bio_iovec_idx(bio, j);
+
+           bv->bv_page = virt_to_page(MMAP_VADDR(pending_idx, i));
+           bv->bv_len = phys_seg[i].nr_sects << 9;
+           bv->bv_offset = phys_seg[i].buffer & ~PAGE_MASK;
+
+           bio->bi_size += bv->bv_len;
+           bio->bi_vcnt++;
+       }
+
+       submit_bio(operation, bio);
+    }
+#endif
+
+    return;
+
+ bad_descriptor:
+    make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+} 
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+static void make_response(blkif_t *blkif, unsigned long id, 
+                          unsigned short op, int st)
+{
+    blkif_response_t *resp;
+    unsigned long     flags;
+
+    /* Place on the response ring for the relevant domain. */ 
+    spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+    resp = &blkif->blk_ring_base->
+        ring[MASK_BLKIF_IDX(blkif->blk_resp_prod)].resp;
+    resp->id        = id;
+    resp->operation = op;
+    resp->status    = st;
+    wmb();
+    blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod;
+    spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+
+    /* Kick the relevant domain. */
+    notify_via_evtchn(blkif->evtchn);
+}
+
+void blkif_deschedule(blkif_t *blkif)
+{
+    remove_from_blkdev_list(blkif);
+}
+
+static int __init blkif_init(void)
+{
+    int i;
+
+    if ( !(start_info.flags & SIF_INITDOMAIN)
+        && !(start_info.flags & SIF_BLK_BE_DOMAIN) )
+        return 0;
+
+    blkif_interface_init();
+
+    if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
+        BUG();
+
+    pending_cons = 0;
+    pending_prod = MAX_PENDING_REQS;
+    memset(pending_reqs, 0, sizeof(pending_reqs));
+    for ( i = 0; i < MAX_PENDING_REQS; i++ )
+        pending_ring[i] = i;
+    
+    spin_lock_init(&blkio_schedule_list_lock);
+    INIT_LIST_HEAD(&blkio_schedule_list);
+
+    if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 )
+        BUG();
+
+#if 0
+    buffer_head_cachep = kmem_cache_create(
+        "buffer_head_cache", sizeof(struct buffer_head),
+        0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+#endif
+
+    blkif_ctrlif_init();
+
+    return 0;
+}
+
+__initcall(blkif_init);
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/blkback/common.h b/linux-2.6.7-xen-sparse/drivers/xen/blkback/common.h
new file mode 100644 (file)
index 0000000..9fa9381
--- /dev/null
@@ -0,0 +1,124 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/common.h
+ */
+
+#ifndef __BLKIF__BACKEND__COMMON_H__
+#define __BLKIF__BACKEND__COMMON_H__
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <asm-xen/ctrl_if.h>
+#include <asm/io.h>
+#include <asm/setup.h>
+#include <asm/pgalloc.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <asm-xen/blkif.h>
+#else
+#include "../blkif.h"
+#define irqreturn_t void
+#define IRQ_HANDLED
+#endif
+
+#if 0
+#define ASSERT(_p) \
+    if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
+    __LINE__, __FILE__); *(int*)0=0; }
+#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
+                           __FILE__ , __LINE__ , ## _a )
+#else
+#define ASSERT(_p) ((void)0)
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+typedef struct blkif_st {
+    /* Unique identifier for this interface. */
+    domid_t          domid;
+    unsigned int     handle;
+    /* Physical parameters of the comms window. */
+    unsigned long    shmem_frame;
+    unsigned int     evtchn;
+    int              irq;
+    /* Comms information. */
+    blkif_ring_t    *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */
+    BLKIF_RING_IDX     blk_req_cons;  /* Request consumer. */
+    BLKIF_RING_IDX     blk_resp_prod; /* Private version of resp. producer. */
+    /* VBDs attached to this interface. */
+    struct rb_root   vbd_rb;        /* Mapping from 16-bit vdevices to VBDs. */
+    spinlock_t       vbd_lock;      /* Protects VBD mapping. */
+    /* Private fields. */
+    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
+    /*
+     * DISCONNECT response is deferred until pending requests are ack'ed.
+     * We therefore need to store the id from the original request.
+     */
+    u8               disconnect_rspid;
+    struct blkif_st *hash_next;
+    struct list_head blkdev_list;
+    spinlock_t       blk_ring_lock;
+    atomic_t         refcnt;
+} blkif_t;
+
+void blkif_create(blkif_be_create_t *create);
+void blkif_destroy(blkif_be_destroy_t *destroy);
+void blkif_connect(blkif_be_connect_t *connect);
+int  blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id);
+void __blkif_disconnect_complete(blkif_t *blkif);
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle);
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b)                             \
+    do {                                          \
+        if ( atomic_dec_and_test(&(_b)->refcnt) ) \
+            __blkif_disconnect_complete(_b);      \
+    } while (0)
+
+/* An entry in a list of xen_extents. */
+typedef struct _blkif_extent_le { 
+    blkif_extent_t extent;               /* an individual extent */
+    struct _blkif_extent_le *next;       /* and a pointer to the next */ 
+    struct block_device *bdev;
+} blkif_extent_le_t; 
+
+typedef struct _vbd { 
+    blkif_vdev_t       vdevice;   /* what the domain refers to this vbd as */
+    unsigned char      readonly;  /* Non-zero -> read-only */
+    unsigned char      type;      /* VDISK_TYPE_xxx */
+    blkif_extent_le_t *extents;   /* list of xen_extents making up this vbd */
+    struct rb_node     rb;        /* for linking into R-B tree lookup struct */
+} vbd_t; 
+
+void vbd_create(blkif_be_vbd_create_t *create); 
+void vbd_grow(blkif_be_vbd_grow_t *grow); 
+void vbd_shrink(blkif_be_vbd_shrink_t *shrink);
+void vbd_destroy(blkif_be_vbd_destroy_t *delete); 
+int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds);
+void destroy_all_vbds(blkif_t *blkif);
+
+/* Describes a [partial] disk extent (part of a block io request) */
+typedef struct {
+    union {
+       unsigned short dev;
+       struct block_device *bdev;
+    } _dev;
+    unsigned short nr_sects;
+    unsigned long  buffer;
+    blkif_sector_t sector_number;
+} phys_seg_t;
+#define ps_device _dev.dev
+#define ps_bdev _dev.bdev
+
+int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation); 
+
+void blkif_interface_init(void);
+void blkif_ctrlif_init(void);
+
+void blkif_deschedule(blkif_t *blkif);
+
+irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+
+#endif /* __BLKIF__BACKEND__COMMON_H__ */
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/blkback/control.c b/linux-2.6.7-xen-sparse/drivers/xen/blkback/control.c
new file mode 100644 (file)
index 0000000..0b26224
--- /dev/null
@@ -0,0 +1,87 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/control.c
+ * 
+ * Routines for interfacing with the control plane.
+ * 
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+
+static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+    DPRINTK("Received blkif backend message, subtype=%d\n", msg->subtype);
+    
+    switch ( msg->subtype )
+    {
+    case CMSG_BLKIF_BE_CREATE:
+        if ( msg->length != sizeof(blkif_be_create_t) )
+            goto parse_error;
+        blkif_create((blkif_be_create_t *)&msg->msg[0]);
+        break;        
+    case CMSG_BLKIF_BE_DESTROY:
+        if ( msg->length != sizeof(blkif_be_destroy_t) )
+            goto parse_error;
+        blkif_destroy((blkif_be_destroy_t *)&msg->msg[0]);
+        break;        
+    case CMSG_BLKIF_BE_CONNECT:
+        if ( msg->length != sizeof(blkif_be_connect_t) )
+            goto parse_error;
+        blkif_connect((blkif_be_connect_t *)&msg->msg[0]);
+        break;        
+    case CMSG_BLKIF_BE_DISCONNECT:
+        if ( msg->length != sizeof(blkif_be_disconnect_t) )
+            goto parse_error;
+        if ( !blkif_disconnect((blkif_be_disconnect_t *)&msg->msg[0],msg->id) )
+            return; /* Sending the response is deferred until later. */
+        break;        
+    case CMSG_BLKIF_BE_VBD_CREATE:
+        if ( msg->length != sizeof(blkif_be_vbd_create_t) )
+            goto parse_error;
+        vbd_create((blkif_be_vbd_create_t *)&msg->msg[0]);
+        break;
+    case CMSG_BLKIF_BE_VBD_DESTROY:
+        if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
+            goto parse_error;
+        vbd_destroy((blkif_be_vbd_destroy_t *)&msg->msg[0]);
+        break;
+    case CMSG_BLKIF_BE_VBD_GROW:
+        if ( msg->length != sizeof(blkif_be_vbd_grow_t) )
+            goto parse_error;
+        vbd_grow((blkif_be_vbd_grow_t *)&msg->msg[0]);
+        break;
+    case CMSG_BLKIF_BE_VBD_SHRINK:
+        if ( msg->length != sizeof(blkif_be_vbd_shrink_t) )
+            goto parse_error;
+        vbd_shrink((blkif_be_vbd_shrink_t *)&msg->msg[0]);
+        break;
+    default:
+        goto parse_error;
+    }
+
+    ctrl_if_send_response(msg);
+    return;
+
+ parse_error:
+    DPRINTK("Parse error while reading message subtype %d, len %d\n",
+            msg->subtype, msg->length);
+    msg->length = 0;
+    ctrl_if_send_response(msg);
+}
+
+void blkif_ctrlif_init(void)
+{
+    ctrl_msg_t                       cmsg;
+    blkif_be_driver_status_changed_t st;
+
+    (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, 
+                                    CALLBACK_IN_BLOCKING_CONTEXT);
+
+    /* Send a driver-UP notification to the domain controller. */
+    cmsg.type      = CMSG_BLKIF_BE;
+    cmsg.subtype   = CMSG_BLKIF_BE_DRIVER_STATUS_CHANGED;
+    cmsg.length    = sizeof(blkif_be_driver_status_changed_t);
+    st.status      = BLKIF_DRIVER_STATUS_UP;
+    memcpy(cmsg.msg, &st, sizeof(st));
+    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+}
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/blkback/interface.c b/linux-2.6.7-xen-sparse/drivers/xen/blkback/interface.c
new file mode 100644 (file)
index 0000000..3cd76b7
--- /dev/null
@@ -0,0 +1,239 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/interface.c
+ * 
+ * Block-device interface management.
+ * 
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#define VMALLOC_VMADDR(x) ((unsigned long)(x))
+#endif
+
+#define BLKIF_HASHSZ 1024
+#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
+
+static kmem_cache_t *blkif_cachep;
+static blkif_t      *blkif_hash[BLKIF_HASHSZ];
+
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
+{
+    blkif_t *b; /* scan the (domid, handle) hash chain for an exact match */
+    for ( b = blkif_hash[BLKIF_HASH(domid, handle)]; b != NULL; b = b->hash_next )
+        if ( (b->domid == domid) && (b->handle == handle) )
+            return b;
+    return NULL;
+}
+
+void __blkif_disconnect_complete(blkif_t *blkif)
+{
+    ctrl_msg_t            cmsg;
+    blkif_be_disconnect_t disc;
+
+    /*
+     * These can't be done in __blkif_disconnect() because at that point there
+     * may be outstanding requests at the disc whose asynchronous responses
+     * must still be notified to the remote driver.
+     */
+    unbind_evtchn_from_irq(blkif->evtchn);
+    vfree(blkif->blk_ring_base);
+
+    /* Construct the deferred response message. */
+    cmsg.type         = CMSG_BLKIF_BE;
+    cmsg.subtype      = CMSG_BLKIF_BE_DISCONNECT;
+    cmsg.id           = blkif->disconnect_rspid;
+    cmsg.length       = sizeof(blkif_be_disconnect_t);
+    disc.domid        = blkif->domid;
+    disc.blkif_handle = blkif->handle;
+    disc.status       = BLKIF_BE_STATUS_OKAY;
+    memcpy(cmsg.msg, &disc, sizeof(disc));
+
+    /*
+     * Make sure message is constructed /before/ status change, because
+     * after the status change the 'blkif' structure could be deallocated at
+     * any time. Also make sure we send the response /after/ status change,
+     * as otherwise a subsequent CONNECT request could spuriously fail if
+     * another CPU doesn't see the status change yet.
+     */
+    mb();
+    if ( blkif->status != DISCONNECTING )
+        BUG();
+    blkif->status = DISCONNECTED;
+    mb();
+
+    /* Send the successful response. */
+    ctrl_if_send_response(&cmsg);
+}
+
+void blkif_create(blkif_be_create_t *create)
+{
+    domid_t       domid  = create->domid;
+    unsigned int  handle = create->blkif_handle;
+    blkif_t     **pblkif, *blkif;
+
+    if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL )
+    {
+        DPRINTK("Could not create blkif: out of memory\n");
+        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    memset(blkif, 0, sizeof(*blkif));
+    blkif->domid  = domid;
+    blkif->handle = handle;
+    blkif->status = DISCONNECTED;
+    spin_lock_init(&blkif->vbd_lock);
+    spin_lock_init(&blkif->blk_ring_lock);
+    atomic_set(&blkif->refcnt, 0);
+
+    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
+    while ( *pblkif != NULL )
+    {
+        if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
+        {
+            DPRINTK("Could not create blkif: already exists\n");
+            create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
+            kmem_cache_free(blkif_cachep, blkif);
+            return;
+        }
+        pblkif = &(*pblkif)->hash_next;
+    }
+
+    blkif->hash_next = *pblkif;
+    *pblkif = blkif;
+
+    DPRINTK("Successfully created blkif\n");
+    create->status = BLKIF_BE_STATUS_OKAY;
+}
+
+void blkif_destroy(blkif_be_destroy_t *destroy)
+{
+    domid_t       domid  = destroy->domid;
+    unsigned int  handle = destroy->blkif_handle;
+    blkif_t     **pblkif, *blkif;
+
+    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
+    while ( (blkif = *pblkif) != NULL )
+    {
+        if ( (blkif->domid == domid) && (blkif->handle == handle) )
+        {
+            if ( blkif->status != DISCONNECTED )
+                goto still_connected;
+            goto destroy;
+        }
+        pblkif = &blkif->hash_next;
+    }
+
+    destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+    return;
+
+ still_connected:
+    destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
+    return;
+
+ destroy:
+    *pblkif = blkif->hash_next;
+    destroy_all_vbds(blkif);
+    kmem_cache_free(blkif_cachep, blkif);
+    destroy->status = BLKIF_BE_STATUS_OKAY;
+}
+
+void blkif_connect(blkif_be_connect_t *connect) /* map frontend ring, attach */
+{
+    domid_t       domid  = connect->domid;
+    unsigned int  handle = connect->blkif_handle;
+    unsigned int  evtchn = connect->evtchn;
+    unsigned long shmem_frame = connect->shmem_frame;
+    struct vm_struct *vma;
+    pgprot_t      prot;
+    int           error;
+    blkif_t      *blkif;
+
+    blkif = blkif_find_by_handle(domid, handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("blkif_connect attempted for non-existent blkif (%u,%u)\n", 
+                connect->domid, connect->blkif_handle); 
+        connect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) /* VA for ring */
+    {
+        connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
+    error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr),
+                                    shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+                                    prot, domid); /* map frontend's ring frame */
+    if ( error != 0 )
+    {
+        if ( error == -ENOMEM )
+            connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        else if ( error == -EFAULT )
+            connect->status = BLKIF_BE_STATUS_MAPPING_ERROR;
+        else
+            connect->status = BLKIF_BE_STATUS_ERROR;
+        vfree(vma->addr); /* releases the get_vm_area() reservation */
+        return;
+    }
+
+    if ( blkif->status != DISCONNECTED ) /* reject double connect */
+    {
+        connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
+        vfree(vma->addr);
+        return;
+    }
+
+    blkif->evtchn        = evtchn;
+    blkif->irq           = bind_evtchn_to_irq(evtchn);
+    blkif->shmem_frame   = shmem_frame;
+    blkif->blk_ring_base = (blkif_ring_t *)vma->addr;
+    blkif->status        = CONNECTED;
+    blkif_get(blkif); /* reference dropped by blkif_disconnect() */
+
+    request_irq(blkif->irq, blkif_be_int, 0, "blkif-backend", blkif); /* NOTE(review): return value ignored; failure leaves no handler bound -- TODO check */
+
+    connect->status = BLKIF_BE_STATUS_OKAY;
+}
+
+int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id)
+{
+    domid_t       domid  = disconnect->domid;
+    unsigned int  handle = disconnect->blkif_handle;
+    blkif_t      *blkif;
+
+    blkif = blkif_find_by_handle(domid, handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("blkif_disconnect attempted for non-existent blkif"
+                " (%u,%u)\n", disconnect->domid, disconnect->blkif_handle); 
+        disconnect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return 1; /* Caller will send response error message. */
+    }
+
+    if ( blkif->status == CONNECTED )
+    {
+        blkif->status = DISCONNECTING;
+        blkif->disconnect_rspid = rsp_id;
+        wmb(); /* Let other CPUs see the status change. */
+        free_irq(blkif->irq, blkif);
+        blkif_deschedule(blkif);
+        blkif_put(blkif);
+        return 0; /* Caller should not send response message. */
+    }
+
+    disconnect->status = BLKIF_BE_STATUS_OKAY;
+    return 1;
+}
+
+void __init blkif_interface_init(void)
+{
+    blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 
+                                     0, 0, NULL, NULL);
+    BUG_ON(blkif_cachep == NULL); /* blkif_hash[] is static: already zeroed */
+}
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/blkback/vbd.c b/linux-2.6.7-xen-sparse/drivers/xen/blkback/vbd.c
new file mode 100644 (file)
index 0000000..bda17b1
--- /dev/null
@@ -0,0 +1,540 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/vbd.c
+ * 
+ * Routines for managing virtual block devices (VBDs).
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ */
+
+#include "common.h"
+
+static dev_t vbd_map_devnum(blkif_pdev_t);
+
+void vbd_create(blkif_be_vbd_create_t *create) 
+{
+    vbd_t       *vbd; 
+    struct rb_node **rb_p, *rb_parent = NULL;
+    blkif_t     *blkif;
+    blkif_vdev_t vdevice = create->vdevice;
+
+    blkif = blkif_find_by_handle(create->domid, create->blkif_handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("vbd_create attempted for non-existent blkif (%u,%u)\n", 
+                create->domid, create->blkif_handle); 
+        create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    spin_lock(&blkif->vbd_lock);
+
+    rb_p = &blkif->vbd_rb.rb_node;
+    while ( *rb_p != NULL )
+    {
+        rb_parent = *rb_p;
+        vbd = rb_entry(rb_parent, vbd_t, rb);
+        if ( vdevice < vbd->vdevice )
+        {
+            rb_p = &rb_parent->rb_left;
+        }
+        else if ( vdevice > vbd->vdevice )
+        {
+            rb_p = &rb_parent->rb_right;
+        }
+        else
+        {
+            DPRINTK("vbd_create attempted for already existing vbd\n");
+            create->status = BLKIF_BE_STATUS_VBD_EXISTS;
+            goto out;
+        }
+    }
+
+    if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) )
+    {
+        DPRINTK("vbd_create: out of memory\n");
+        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        goto out;
+    }
+
+    vbd->vdevice  = vdevice; 
+    vbd->readonly = create->readonly;
+    vbd->type     = VDISK_TYPE_DISK | VDISK_FLAG_VIRT;
+    vbd->extents  = NULL; 
+
+    rb_link_node(&vbd->rb, rb_parent, rb_p);
+    rb_insert_color(&vbd->rb, &blkif->vbd_rb);
+
+    DPRINTK("Successful creation of vdev=%04x (dom=%u)\n",
+            vdevice, create->domid);
+    create->status = BLKIF_BE_STATUS_OKAY;
+
+ out:
+    spin_unlock(&blkif->vbd_lock);
+}
+
+
+/* Grow a VBD by appending a new extent. Fails if the VBD doesn't exist. */
+void vbd_grow(blkif_be_vbd_grow_t *grow) 
+{
+    blkif_t            *blkif;
+    blkif_extent_le_t **px, *x; 
+    vbd_t              *vbd = NULL;
+    struct rb_node     *rb;
+    blkif_vdev_t        vdevice = grow->vdevice;
+#if 0
+    unsigned long       sz;
+#endif
+
+
+    blkif = blkif_find_by_handle(grow->domid, grow->blkif_handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("vbd_grow attempted for non-existent blkif (%u,%u)\n", 
+                grow->domid, grow->blkif_handle); 
+        grow->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    spin_lock(&blkif->vbd_lock);
+
+    rb = blkif->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( vdevice < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( vdevice > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            break;
+    }
+
+    if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) )
+    {
+        DPRINTK("vbd_grow: attempted to append extent to non-existent VBD.\n");
+        grow->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+        goto out;
+    } 
+
+    if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t), 
+                               GFP_KERNEL)) == NULL) )
+    {
+        DPRINTK("vbd_grow: out of memory\n");
+        grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        goto out;
+    }
+
+    x->extent.device        = grow->extent.device;
+    /* XXXcl see comments at top of open_by_devnum */
+#if 01
+    x->bdev = open_by_devnum(vbd_map_devnum(x->extent.device),
+                            vbd->readonly ? FMODE_READ : FMODE_WRITE);
+#endif
+    /* XXXcl maybe bd_claim? */
+    x->extent.sector_start  = grow->extent.sector_start;
+    x->extent.sector_length = grow->extent.sector_length;
+    x->next                 = (blkif_extent_le_t *)NULL;
+
+#if 0
+    if( !blk_size[MAJOR(x->extent.device)] )
+    {
+        DPRINTK("vbd_grow: device %08x doesn't exist.\n", x->extent.device);
+       grow->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
+       goto out;
+    }
+    
+    /* convert blocks (1KB) to sectors */
+    sz = blk_size[MAJOR(x->extent.device)][MINOR(x->extent.device)] * 2;    
+#endif
+
+    if ( x->extent.sector_start > 0 )
+    {
+        DPRINTK("vbd_grow: device %08x start not zero!\n", x->extent.device);
+       grow->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
+       goto out;
+    }
+
+#if 0
+    /*
+     * NB. This test assumes sector_start == 0, which is always the case
+     * in Xen 1.3. In fact the whole grow/shrink interface could do with
+     * some simplification.
+     */
+    if ( x->extent.sector_length > sz )
+        x->extent.sector_length = sz;
+    
+    DPRINTK("vbd_grow: requested_len %llu actual_len %lu\n", 
+            x->extent.sector_length, sz);
+#endif
+
+    for ( px = &vbd->extents; *px != NULL; px = &(*px)->next ) 
+        continue;
+    
+    *px = x;
+
+    DPRINTK("Successful grow of vdev=%04x (dom=%u)\n",
+            vdevice, grow->domid);
+    
+    grow->status = BLKIF_BE_STATUS_OKAY;
+
+ out:
+    spin_unlock(&blkif->vbd_lock);
+}
+
+
+void vbd_shrink(blkif_be_vbd_shrink_t *shrink)
+{
+    blkif_t            *blkif;
+    blkif_extent_le_t **px, *x; 
+    vbd_t              *vbd = NULL;
+    struct rb_node     *rb;
+    blkif_vdev_t        vdevice = shrink->vdevice;
+
+    blkif = blkif_find_by_handle(shrink->domid, shrink->blkif_handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("vbd_shrink attempted for non-existent blkif (%u,%u)\n", 
+                shrink->domid, shrink->blkif_handle); 
+        shrink->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    spin_lock(&blkif->vbd_lock);
+
+    rb = blkif->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( vdevice < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( vdevice > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            break;
+    }
+
+    if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) )
+    {
+        shrink->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+        goto out;
+    }
+
+    if ( unlikely(vbd->extents == NULL) )
+    {
+        shrink->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
+        goto out;
+    }
+
+    /* Find the last extent. We now know that there is at least one. */
+    for ( px = &vbd->extents; (*px)->next != NULL; px = &(*px)->next )
+        continue;
+
+    x   = *px;
+    *px = x->next;
+    kfree(x);
+
+    shrink->status = BLKIF_BE_STATUS_OKAY;
+
+ out:
+    spin_unlock(&blkif->vbd_lock);
+}
+
+
+void vbd_destroy(blkif_be_vbd_destroy_t *destroy) 
+{
+    blkif_t           *blkif;
+    vbd_t             *vbd;
+    struct rb_node    *rb;
+    blkif_extent_le_t *x, *t;
+    blkif_vdev_t       vdevice = destroy->vdevice;
+
+    blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("vbd_destroy attempted for non-existent blkif (%u,%u)\n", 
+                destroy->domid, destroy->blkif_handle); 
+        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    spin_lock(&blkif->vbd_lock);
+
+    rb = blkif->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( vdevice < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( vdevice > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            goto found;
+    }
+
+    destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+    goto out;
+
+ found:
+    rb_erase(rb, &blkif->vbd_rb);
+    x = vbd->extents;
+    kfree(vbd);
+
+    while ( x != NULL )
+    {
+        t = x->next;
+        kfree(x);
+        x = t;
+    }
+    
+ out:
+    spin_unlock(&blkif->vbd_lock);
+}
+
+
+/* Tear down every VBD attached to 'blkif', freeing all extent records. */
+void destroy_all_vbds(blkif_t *blkif)
+{
+    vbd_t             *vbd;
+    struct rb_node    *root;
+    blkif_extent_le_t *ext, *next;
+
+    spin_lock(&blkif->vbd_lock);
+
+    /* Repeatedly unlink the tree root until the tree is empty. */
+    while ( (root = blkif->vbd_rb.rb_node) != NULL )
+    {
+        vbd = rb_entry(root, vbd_t, rb);
+        rb_erase(root, &blkif->vbd_rb);
+
+        /* Free the extent list, then the vbd that owned it. */
+        for ( ext = vbd->extents; ext != NULL; ext = next )
+        {
+            next = ext->next;
+            kfree(ext);
+        }
+        kfree(vbd);
+    }
+
+    spin_unlock(&blkif->vbd_lock);
+}
+
+
+static int vbd_probe_single(blkif_t *blkif, vdisk_t *vbd_info, vbd_t *vbd)
+{
+    blkif_extent_le_t *ext;
+    /* Capacity of a VBD is the sum of the lengths of all its extents. */
+    vbd_info->capacity = 0ULL;
+    for ( ext = vbd->extents; ext != NULL; ext = ext->next )
+        vbd_info->capacity += ext->extent.sector_length;
+
+    vbd_info->device = vbd->vdevice;
+    vbd_info->info   = vbd->type;
+    if ( vbd->readonly )
+        vbd_info->info |= VDISK_FLAG_RO;
+    return 0;
+}
+
+
+int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds)
+{
+    int rc = 0, nr_vbds = 0;
+    struct rb_node *rb;
+
+    spin_lock(&blkif->vbd_lock);
+
+    if ( (rb = blkif->vbd_rb.rb_node) == NULL )
+        goto out;
+
+ new_subtree:
+    /* STEP 1. Find least node (it'll be left-most). */
+    while ( rb->rb_left != NULL )
+        rb = rb->rb_left;
+
+    for ( ; ; )
+    {
+        /* STEP 2. Dealt with left subtree. Now process current node. */
+        if ( (rc = vbd_probe_single(blkif, &vbd_info[nr_vbds], 
+                                    rb_entry(rb, vbd_t, rb))) != 0 )
+            goto out;
+        if ( ++nr_vbds == max_vbds )
+            goto out;
+
+        /* STEP 3. Process right subtree, if any. */
+        if ( rb->rb_right != NULL )
+        {
+            rb = rb->rb_right;
+            goto new_subtree;
+        }
+
+        /* STEP 4. Done both subtrees. Head back through ancesstors. */
+        for ( ; ; ) 
+        {
+            /* We're done when we get back to the root node. */
+            if ( rb->rb_parent == NULL )
+                goto out;
+            /* If we are left of parent, then parent is next to process. */
+            if ( rb->rb_parent->rb_left == rb )
+                break;
+            /* If we are right of parent, then we climb to grandparent. */
+            rb = rb->rb_parent;
+        }
+
+        rb = rb->rb_parent;
+    }
+
+ out:
+    spin_unlock(&blkif->vbd_lock);
+    return (rc == 0) ? nr_vbds : rc;  
+}
+
+
+int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation)
+{
+    blkif_extent_le_t *x; 
+    vbd_t             *vbd;
+    struct rb_node    *rb;
+    blkif_sector_t     sec_off;
+    unsigned long      nr_secs;
+
+    spin_lock(&blkif->vbd_lock);
+
+    rb = blkif->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( pseg->ps_device < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( pseg->ps_device > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            goto found;
+    }
+
+    DPRINTK("vbd_translate; domain %u attempted to access "
+            "non-existent VBD.\n", blkif->domid);
+
+    spin_unlock(&blkif->vbd_lock);
+    return -ENODEV; 
+
+ found:
+
+    if ( (operation == WRITE) && vbd->readonly )
+    {
+        spin_unlock(&blkif->vbd_lock);
+        return -EACCES; 
+    }
+
+    /*
+     * Now iterate through the list of blkif_extents, working out which should 
+     * be used to perform the translation.
+     */
+    sec_off = pseg->sector_number; 
+    nr_secs = pseg->nr_sects;
+    for ( x = vbd->extents; x != NULL; x = x->next )
+    { 
+        if ( sec_off < x->extent.sector_length )
+        {
+#if 0
+            pseg->ps_device = x->extent.device;
+#else
+           pseg->ps_bdev = x->bdev;
+#endif
+            pseg->sector_number = x->extent.sector_start + sec_off;
+            if ( unlikely((sec_off + nr_secs) > x->extent.sector_length) )
+                goto overrun;
+            spin_unlock(&blkif->vbd_lock);
+            return 1;
+        } 
+        sec_off -= x->extent.sector_length; 
+    }
+
+    DPRINTK("vbd_translate: end of vbd.\n");
+    spin_unlock(&blkif->vbd_lock);
+    return -EACCES; 
+
+    /*
+     * Here we deal with overrun onto the following extent. We don't deal with 
+     * overrun of more than one boundary since each request is restricted to 
+     * 2^9 512-byte sectors, so it should be trivial for control software to 
+     * ensure that extents are large enough to prevent excessive overrun.
+     */
+ overrun:
+
+    /* Adjust length of first chunk to run to end of first extent. */
+    pseg[0].nr_sects = x->extent.sector_length - sec_off;
+
+    /* Set second chunk buffer and length to start where first chunk ended. */
+    pseg[1].buffer   = pseg[0].buffer + (pseg[0].nr_sects << 9);
+    pseg[1].nr_sects = nr_secs - pseg[0].nr_sects;
+
+    /* Now move to the next extent. Check it exists and is long enough! */
+    if ( unlikely((x = x->next) == NULL) || 
+         unlikely(x->extent.sector_length < pseg[1].nr_sects) )
+    {
+        DPRINTK("vbd_translate: multiple overruns or end of vbd.\n");
+        spin_unlock(&blkif->vbd_lock);
+        return -EACCES;
+    }
+
+    /* Store the real device and start sector for the second chunk. */
+#if 0
+    pseg[1].ps_device     = x->extent.device;
+#else
+    pseg->ps_bdev         = x->bdev;
+#endif
+    pseg[1].sector_number = x->extent.sector_start;
+    
+    spin_unlock(&blkif->vbd_lock);
+    return 2;
+}
+
+#define MAJOR_XEN(dev) ((dev)>>8)
+#define MINOR_XEN(dev) ((dev) & 0xff)
+
+#define        XEN_IDE0_MAJOR IDE0_MAJOR
+#define        XEN_IDE1_MAJOR IDE1_MAJOR
+#define        XEN_IDE2_MAJOR IDE2_MAJOR
+#define        XEN_IDE3_MAJOR IDE3_MAJOR
+#define        XEN_IDE4_MAJOR IDE4_MAJOR
+#define        XEN_IDE5_MAJOR IDE5_MAJOR
+#define        XEN_IDE6_MAJOR IDE6_MAJOR
+#define        XEN_IDE7_MAJOR IDE7_MAJOR
+#define        XEN_IDE8_MAJOR IDE8_MAJOR
+#define        XEN_IDE9_MAJOR IDE9_MAJOR
+#define        XEN_SCSI_DISK0_MAJOR SCSI_DISK0_MAJOR
+#define        XEN_SCSI_DISK1_MAJOR SCSI_DISK1_MAJOR
+#define        XEN_SCSI_DISK2_MAJOR SCSI_DISK2_MAJOR
+#define        XEN_SCSI_DISK3_MAJOR SCSI_DISK3_MAJOR
+#define        XEN_SCSI_DISK4_MAJOR SCSI_DISK4_MAJOR
+#define        XEN_SCSI_DISK5_MAJOR SCSI_DISK5_MAJOR
+#define        XEN_SCSI_DISK6_MAJOR SCSI_DISK6_MAJOR
+#define        XEN_SCSI_DISK7_MAJOR SCSI_DISK7_MAJOR
+#define        XEN_SCSI_CDROM_MAJOR SCSI_CDROM_MAJOR
+
+static dev_t vbd_map_devnum(blkif_pdev_t cookie)
+{
+    int new_major;
+    int major = MAJOR_XEN(cookie);
+    int minor = MINOR_XEN(cookie);
+
+    switch (major) {
+    case XEN_IDE0_MAJOR: new_major = IDE0_MAJOR; break;
+    case XEN_IDE1_MAJOR: new_major = IDE1_MAJOR; break;
+    case XEN_IDE2_MAJOR: new_major = IDE2_MAJOR; break;
+    case XEN_IDE3_MAJOR: new_major = IDE3_MAJOR; break;
+    case XEN_IDE4_MAJOR: new_major = IDE4_MAJOR; break;
+    case XEN_IDE5_MAJOR: new_major = IDE5_MAJOR; break;
+    case XEN_IDE6_MAJOR: new_major = IDE6_MAJOR; break;
+    case XEN_IDE7_MAJOR: new_major = IDE7_MAJOR; break;
+    case XEN_IDE8_MAJOR: new_major = IDE8_MAJOR; break;
+    case XEN_IDE9_MAJOR: new_major = IDE9_MAJOR; break;
+    case XEN_SCSI_DISK0_MAJOR: new_major = SCSI_DISK0_MAJOR; break;
+    case XEN_SCSI_DISK1_MAJOR ... XEN_SCSI_DISK7_MAJOR:
+       new_major = SCSI_DISK1_MAJOR + major - XEN_SCSI_DISK1_MAJOR;
+       break;
+    case XEN_SCSI_CDROM_MAJOR: new_major = SCSI_CDROM_MAJOR; break;
+    default: new_major = 0; break;
+    }
+
+    return MKDEV(new_major, minor);
+}
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/blkfront/Kconfig b/linux-2.6.7-xen-sparse/drivers/xen/blkfront/Kconfig
new file mode 100644 (file)
index 0000000..edde837
--- /dev/null
@@ -0,0 +1,6 @@
+
+config XENBLOCK
+       tristate "Block device driver"
+       depends on ARCH_XEN
+       help
+         Block device driver for Xen
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/blkfront/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/blkfront/Makefile
new file mode 100644 (file)
index 0000000..5d1707d
--- /dev/null
@@ -0,0 +1,3 @@
+
+obj-y  := blkfront.o vbd.o
+
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/blkfront/blkfront.c b/linux-2.6.7-xen-sparse/drivers/xen/blkfront/blkfront.c
new file mode 100644 (file)
index 0000000..db9c6b4
--- /dev/null
@@ -0,0 +1,653 @@
+/******************************************************************************
+ * block.c
+ * 
+ * XenLinux virtual block-device driver.
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004, Christian Limpach
+ */
+
+#include "block.h"
+#include <linux/cdrom.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <scsi/scsi.h>
+#include <asm-xen/ctrl_if.h>
+
+typedef unsigned char byte; /* from linux/ide.h */
+
+#define BLKIF_STATE_CLOSED       0
+#define BLKIF_STATE_DISCONNECTED 1
+#define BLKIF_STATE_CONNECTED    2
+static unsigned int blkif_state = BLKIF_STATE_CLOSED;
+static unsigned int blkif_evtchn, blkif_irq;
+
+static int blkif_control_rsp_valid;
+static blkif_response_t blkif_control_rsp;
+
+static blkif_ring_t *blk_ring;
+static BLKIF_RING_IDX resp_cons; /* Response consumer for comms ring. */
+static BLKIF_RING_IDX req_prod;  /* Private request producer.         */
+
+static blkif_ring_t *blk_ring_rec; /* Private copy of requests, used for
+                                    * recovery.  Responses not stored here. */
+static BLKIF_RING_IDX resp_cons_rec; /* Copy of response consumer, used for
+                                      * recovery */
+static int recovery = 0;           /* "Recovery in progress" flag.  Protected
+                                    * by the blkif_io_lock */
+
+/* We plug the I/O ring if the driver is suspended or if the ring is full. */
+#define        BLKIF_RING_FULL (((req_prod - resp_cons) == BLKIF_RING_SIZE) || \
+                        (blkif_state != BLKIF_STATE_CONNECTED))
+
+/*
+ * Request queues with outstanding work, but ring is currently full.
+ * We need no special lock here, as we always access this with the
+ * blkif_io_lock held. We only need a small maximum list.
+ */
+#define MAX_PENDING 8
+static request_queue_t *pending_queues[MAX_PENDING];
+static int nr_pending;
+
+static inline void flush_requests(void)
+{
+        /* Ensure ring entries are visible before the producer index update. */
+        wmb();
+        blk_ring->req_prod = req_prod;
+        notify_via_evtchn(blkif_evtchn);
+}
+
+
+#if 0
+/*
+ * blkif_update_int/update-vbds_task - handle VBD update events.
+ *  Schedule a task for keventd to run, which will update the VBDs and perform 
+ *  the corresponding updates to our view of VBD state.
+ */
+static struct tq_struct update_tq;
+static void update_vbds_task(void *unused)
+{ 
+    xlvbd_update_vbds();
+}
+#endif
+
+
+int blkif_open(struct inode *inode, struct file *filep)
+{
+       struct gendisk *gd = inode->i_bdev->bd_disk;
+       struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
+
+       /* Update of usage count is protected by per-device semaphore. */
+       di->mi->usage++;
+
+       return 0;
+}
+
+
+int blkif_release(struct inode *inode, struct file *filep)
+{
+       struct gendisk *gd = inode->i_bdev->bd_disk;
+       struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
+
+       /*
+        * When usage drops to zero it may allow more VBD updates to occur.
+        * Update of usage count is protected by a per-device semaphore.
+        */
+       if (--di->mi->usage == 0) {
+#if 0
+               update_tq.routine = update_vbds_task;
+               schedule_task(&update_tq);
+#endif
+       }
+
+       return 0;
+}
+
+
+int blkif_ioctl(struct inode *inode, struct file *filep,
+                          unsigned command, unsigned long argument)
+{
+       /*  struct gendisk *gd = inode->i_bdev->bd_disk; */
+
+       DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
+           command, (long)argument, inode->i_rdev); 
+  
+       switch (command) {
+
+       case HDIO_GETGEO:
+               /* return ENOSYS to use defaults */
+               return -ENOSYS;
+
+       default:
+               printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
+                      command);
+               return -ENOSYS;
+       }
+
+       return 0;
+}
+
+#if 0
+/* check media change: should probably do something here in some cases :-) */
+int blkif_check(kdev_t dev)
+{
+    DPRINTK("blkif_check\n");
+    return 0;
+}
+
+int blkif_revalidate(kdev_t dev)
+{
+    struct block_device *bd;
+    struct gendisk *gd;
+    xen_block_t *disk;
+    unsigned long capacity;
+    int i, rc = 0;
+    
+    if ( (bd = bdget(dev)) == NULL )
+        return -EINVAL;
+
+    /*
+     * Update of partition info, and check of usage count, is protected
+     * by the per-block-device semaphore.
+     */
+    down(&bd->bd_sem);
+
+    if ( ((gd = get_gendisk(dev)) == NULL) ||
+         ((disk = xldev_to_xldisk(dev)) == NULL) ||
+         ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
+    {
+        rc = -EINVAL;
+        goto out;
+    }
+
+    if ( disk->usage > 1 )
+    {
+        rc = -EBUSY;
+        goto out;
+    }
+
+    /* Only reread partition table if VBDs aren't mapped to partitions. */
+    if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
+    {
+        for ( i = gd->max_p - 1; i >= 0; i-- )
+        {
+            invalidate_device(dev+i, 1);
+            gd->part[MINOR(dev+i)].start_sect = 0;
+            gd->part[MINOR(dev+i)].nr_sects   = 0;
+            gd->sizes[MINOR(dev+i)]           = 0;
+        }
+
+        grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
+    }
+
+ out:
+    up(&bd->bd_sem);
+    bdput(bd);
+    return rc;
+}
+#endif
+
+
+/*
+ * blkif_queue_request
+ *
+ * request block io 
+ * 
+ * id: for guest use only.
+ * operation: BLKIF_OP_{READ,WRITE,PROBE}
+ * buffer: buffer to read/write into. this should be a
+ *   virtual address in the guest os.
+ */
+static int blkif_queue_request(struct request *req)
+{
+       struct xlbd_disk_info *di =
+               (struct xlbd_disk_info *)req->rq_disk->private_data;
+       unsigned long buffer_ma;
+       blkif_request_t *ring_req;
+       struct bio *bio;
+       struct bio_vec *bvec;
+       int idx, s;
+        unsigned int fsect, lsect;
+
+        if (unlikely(blkif_state != BLKIF_STATE_CONNECTED))
+                return 1;
+
+       /* Fill out a communications ring structure. */
+       ring_req = &blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req;
+       ring_req->id = (unsigned long)req;
+       ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE :
+               BLKIF_OP_READ;
+       ring_req->sector_number = (blkif_sector_t)req->sector;
+       ring_req->device = di->xd_device;
+
+       s = 0;
+       ring_req->nr_segments = 0;
+       rq_for_each_bio(bio, req) {
+               bio_for_each_segment(bvec, bio, idx) {
+                       buffer_ma =
+                                phys_to_machine(page_to_phys(bvec->bv_page));
+                       if (unlikely((buffer_ma & ((1<<9)-1)) != 0))
+                               BUG();
+
+                        fsect = bvec->bv_offset >> 9;
+                        lsect = fsect + (bvec->bv_len >> 9) - 1;
+                        if (unlikely(lsect > 7))
+                                BUG();
+
+                       ring_req->frame_and_sects[ring_req->nr_segments++] =
+                               buffer_ma | (fsect << 3) | lsect;
+                       s += bvec->bv_len >> 9;
+               }
+       }
+
+       req_prod++;
+
+        /* Keep a private copy so we can reissue requests when recovering. */
+        blk_ring_rec->ring[MASK_BLKIF_IDX(blk_ring_rec->req_prod)].req =
+                *ring_req;
+        blk_ring_rec->req_prod++;
+
+        return 0;
+}
+
+/*
+ * do_blkif_request
+ *  read a block; request is in a request queue
+ */
+void do_blkif_request(request_queue_t *rq)
+{
+       struct request *req;
+       int queued;
+
+       DPRINTK("Entered do_blkif_request\n"); 
+
+       queued = 0;
+
+       while ((req = elv_next_request(rq)) != NULL) {
+               if (!blk_fs_request(req)) {
+                       end_request(req, 0);
+                       continue;
+               }
+
+               if (BLKIF_RING_FULL) {
+                       blk_stop_queue(rq);
+                       break;
+               }
+               DPRINTK("do_blkif_request %p: cmd %p, sec %lx, (%u/%li) buffer:%p [%s]\n",
+                   req, req->cmd, req->sector, req->current_nr_sectors,
+                   req->nr_sectors, req->buffer,
+                   rq_data_dir(req) ? "write" : "read");
+                blkdev_dequeue_request(req);
+               if (blkif_queue_request(req)) {
+                        blk_stop_queue(rq);
+                        break;
+                }
+               queued++;
+       }
+
+       if (queued != 0)
+               flush_requests();
+}
+
+
+static void kick_pending_request_queues(void)
+{
+    /* We kick pending request queues if the ring is reasonably empty. */
+    if ( (nr_pending != 0) && 
+         ((req_prod - resp_cons) < (BLKIF_RING_SIZE >> 1)) )
+    {
+        /* Attempt to drain the queue, but bail if the ring becomes full. */
+        while ( (nr_pending != 0) && !BLKIF_RING_FULL )
+            do_blkif_request(pending_queues[--nr_pending]);
+    }
+}
+
+
+static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
+{
+       struct request *req;
+       blkif_response_t *bret;
+       BLKIF_RING_IDX i; 
+       unsigned long flags; 
+
+       spin_lock_irqsave(&blkif_io_lock, flags);     
+
+        if (unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery)) {
+                printk("Bailed out\n");
+        
+                spin_unlock_irqrestore(&blkif_io_lock, flags);
+                return IRQ_HANDLED;
+        }
+
+       for (i = resp_cons; i != blk_ring->resp_prod; i++) {
+               bret = &blk_ring->ring[MASK_BLKIF_IDX(i)].resp;
+               switch (bret->operation) {
+               case BLKIF_OP_READ:
+               case BLKIF_OP_WRITE:
+                       if (unlikely(bret->status != BLKIF_RSP_OKAY))
+                               DPRINTK("Bad return from blkdev data request: %lx\n",
+                                   bret->status);
+                       req = (struct request *)bret->id;
+                        /* XXXcl pass up status */
+                       if (unlikely(end_that_request_first(req, 1,
+                           req->hard_nr_sectors)))
+                               BUG();
+
+                       end_that_request_last(req);
+                       break;
+                case BLKIF_OP_PROBE:
+                        memcpy(&blkif_control_rsp, bret, sizeof(*bret));
+                        blkif_control_rsp_valid = 1;
+                        break;
+               default:
+                       BUG();
+               }
+       }
+    
+       resp_cons = i;
+        resp_cons_rec = i;
+
+       if (xlbd_blk_queue &&
+            test_bit(QUEUE_FLAG_STOPPED, &xlbd_blk_queue->queue_flags)) {
+               blk_start_queue(xlbd_blk_queue);
+               /* XXXcl call to request_fn should not be needed but
+                 * we get stuck without...  needs investigating
+                */
+               xlbd_blk_queue->request_fn(xlbd_blk_queue);
+       }
+
+       spin_unlock_irqrestore(&blkif_io_lock, flags);
+
+       return IRQ_HANDLED;
+}
+
+
+void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp)
+{
+    unsigned long flags;
+
+ retry:
+    while ( (req_prod - resp_cons) == BLKIF_RING_SIZE )
+    {
+        set_current_state(TASK_INTERRUPTIBLE);
+        schedule_timeout(1);
+    }
+
+    spin_lock_irqsave(&blkif_io_lock, flags);
+    if ( (req_prod - resp_cons) == BLKIF_RING_SIZE )
+    {
+        spin_unlock_irqrestore(&blkif_io_lock, flags);
+        goto retry;
+    }
+
+    memcpy(&blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req, req, sizeof(*req));
+    memcpy(&blk_ring_rec->ring[MASK_BLKIF_IDX(blk_ring_rec->req_prod++)].req,
+           req, sizeof(*req));
+    req_prod++;
+    flush_requests();
+
+    spin_unlock_irqrestore(&blkif_io_lock, flags);
+
+    while ( !blkif_control_rsp_valid )
+    {
+        set_current_state(TASK_INTERRUPTIBLE);
+        schedule_timeout(1);
+    }
+
+    memcpy(rsp, &blkif_control_rsp, sizeof(*rsp));
+    blkif_control_rsp_valid = 0;
+}
+
+
+static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
+{
+    ctrl_msg_t                   cmsg;
+    blkif_fe_interface_connect_t up;
+
+    if ( status->handle != 0 )
+    {
+        printk(KERN_WARNING "Status change on unsupported blkif %d\n",
+               status->handle);
+        return;
+    }
+
+    switch ( status->status )
+    {
+    case BLKIF_INTERFACE_STATUS_DESTROYED:
+        printk(KERN_WARNING "Unexpected blkif-DESTROYED message in state %d\n",
+               blkif_state);
+        break;
+
+    case BLKIF_INTERFACE_STATUS_DISCONNECTED:
+        if ( blkif_state != BLKIF_STATE_CLOSED )
+        {
+            printk(KERN_WARNING "Unexpected blkif-DISCONNECTED message"
+                   " in state %d\n", blkif_state);
+
+            printk(KERN_INFO "VBD driver recovery in progress\n");
+            
+            /* Prevent new requests being issued until we fix things up. */
+            spin_lock_irq(&blkif_io_lock);
+            recovery = 1;
+            blkif_state = BLKIF_STATE_DISCONNECTED;
+            spin_unlock_irq(&blkif_io_lock);
+
+            /* Free resources associated with old device channel. */
+            free_page((unsigned long)blk_ring);
+            free_irq(blkif_irq, NULL);
+            unbind_evtchn_from_irq(blkif_evtchn);
+        }
+
+        /* Move from CLOSED to DISCONNECTED state. */
+        blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
+        blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
+        blkif_state  = BLKIF_STATE_DISCONNECTED;
+
+        /* Construct an interface-CONNECT message for the domain controller. */
+        cmsg.type      = CMSG_BLKIF_FE;
+        cmsg.subtype   = CMSG_BLKIF_FE_INTERFACE_CONNECT;
+        cmsg.length    = sizeof(blkif_fe_interface_connect_t);
+        up.handle      = 0;
+        up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT;
+        memcpy(cmsg.msg, &up, sizeof(up));
+        
+        /* Tell the controller to bring up the interface. */
+        ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+        break;
+
+    case BLKIF_INTERFACE_STATUS_CONNECTED:
+        if ( blkif_state == BLKIF_STATE_CLOSED )
+        {
+            printk(KERN_WARNING "Unexpected blkif-CONNECTED message"
+                   " in state %d\n", blkif_state);
+            break;
+        }
+
+        blkif_evtchn = status->evtchn;
+        blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
+        (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL);
+
+        if ( recovery )
+        {
+            int i;
+
+           /* Shouldn't need the blkif_io_lock here - the device is
+            * plugged and the recovery flag prevents the interrupt handler
+            * changing anything. */
+
+            /* Reissue requests from the private block ring. */
+            for ( i = 0;
+                 resp_cons_rec < blk_ring_rec->req_prod;
+                  resp_cons_rec++, i++ )
+            {
+                blk_ring->ring[i].req
+                    = blk_ring_rec->ring[MASK_BLKIF_IDX(resp_cons_rec)].req;
+            }
+
+            /* Reset the private block ring to match the new ring. */
+            memcpy(blk_ring_rec, blk_ring, sizeof(*blk_ring));
+            resp_cons_rec = 0;
+
+            /* blk_ring->req_prod will be set when we flush_requests().*/
+            blk_ring_rec->req_prod = req_prod = i;
+
+            wmb();
+
+            /* Switch off recovery mode, using a memory barrier to ensure that
+             * it's seen before we flush requests - we don't want to miss any
+             * interrupts. */
+            recovery = 0;
+            wmb();
+
+            /* Kicks things back into life. */
+            flush_requests();
+        }
+        else
+        {
+            /* Probe for discs that are attached to the interface. */
+            xlvbd_init();
+        }
+
+        blkif_state = BLKIF_STATE_CONNECTED;
+        
+        /* Kick pending requests. */
+        spin_lock_irq(&blkif_io_lock);
+        kick_pending_request_queues();
+        spin_unlock_irq(&blkif_io_lock);
+
+        break;
+
+    default:
+        printk(KERN_WARNING "Status change to unknown value %d\n", 
+               status->status);
+        break;
+    }
+}
+
+
+static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+    switch ( msg->subtype )
+    {
+    case CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED:
+        if ( msg->length != sizeof(blkif_fe_interface_status_changed_t) )
+            goto parse_error;
+        blkif_status_change((blkif_fe_interface_status_changed_t *)
+                            &msg->msg[0]);
+        break;        
+#if 0
+    case CMSG_BLKIF_FE_VBD_STATUS_CHANGED:
+        update_tq.routine = update_vbds_task;
+        schedule_task(&update_tq);
+        break;
+#endif
+    default:
+        goto parse_error;
+    }
+
+    ctrl_if_send_response(msg);
+    return;
+
+ parse_error:
+    msg->length = 0;
+    ctrl_if_send_response(msg);
+}
+
+
+int __init xlblk_init(void)
+{
+    ctrl_msg_t                       cmsg;
+    blkif_fe_driver_status_changed_t st;
+
+    if ( (start_info.flags & SIF_INITDOMAIN) 
+        || (start_info.flags & SIF_BLK_BE_DOMAIN) )
+        return 0;
+
+    printk(KERN_INFO "Initialising Xen virtual block device\n");
+
+    blk_ring_rec = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
+    memset(blk_ring_rec, 0, sizeof(*blk_ring_rec));
+
+    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
+                                    CALLBACK_IN_BLOCKING_CONTEXT);
+
+    /* Send a driver-UP notification to the domain controller. */
+    cmsg.type      = CMSG_BLKIF_FE;
+    cmsg.subtype   = CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED;
+    cmsg.length    = sizeof(blkif_fe_driver_status_changed_t);
+    st.status      = BLKIF_DRIVER_STATUS_UP;
+    memcpy(cmsg.msg, &st, sizeof(st));
+    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+
+    /*
+     * We should read 'nr_interfaces' from response message and wait
+     * for notifications before proceeding. For now we assume that we
+     * will be notified of exactly one interface.
+     */
+    while ( blkif_state != BLKIF_STATE_CONNECTED )
+    {
+        set_current_state(TASK_INTERRUPTIBLE);
+        schedule_timeout(1);
+    }
+
+    return 0;
+#if 0
+       int error; 
+
+       reset_xlblk_interface();
+
+       xlblk_response_irq = bind_virq_to_irq(VIRQ_BLKDEV);
+       xlblk_update_irq   = bind_virq_to_irq(VIRQ_VBD_UPD);
+
+       error = request_irq(xlblk_response_irq, xlblk_response_int, 
+                           SA_SAMPLE_RANDOM, "blkdev", NULL);
+       if (error) {
+               printk(KERN_ALERT "Could not allocate receive interrupt\n");
+               goto fail;
+       }
+
+       error = request_irq(xlblk_update_irq, xlblk_update_int,
+                           0, "blkdev", NULL);
+       if (error) {
+               printk(KERN_ALERT
+                      "Could not allocate block update interrupt\n");
+               goto fail;
+       }
+
+       (void)xlvbd_init();
+
+       return 0;
+
+ fail:
+       return error;
+#endif
+}
+
+
+static void __exit xlblk_cleanup(void)
+{
+    /* XXX FIXME */
+    BUG();
+#if 0
+       /*  xlvbd_cleanup(); */
+       free_irq(xlblk_response_irq, NULL);
+       free_irq(xlblk_update_irq, NULL);
+       unbind_virq_from_irq(VIRQ_BLKDEV);
+       unbind_virq_from_irq(VIRQ_VBD_UPD);
+#endif
+}
+
+
+module_init(xlblk_init);
+module_exit(xlblk_cleanup);
+
+
/* Suspend hook: no driver state to save yet, so this is a stub. */
void blkdev_suspend(void)
{
}
+
+
/* Resume hook: no driver state to restore yet, so this is a stub. */
void blkdev_resume(void)
{
}
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/blkfront/block.h b/linux-2.6.7-xen-sparse/drivers/xen/blkfront/block.h
new file mode 100644 (file)
index 0000000..77002dd
--- /dev/null
@@ -0,0 +1,92 @@
+/******************************************************************************
+ * block.h
+ * 
+ * Shared definitions between all levels of XenLinux Virtual block devices.
+ */
+
+#ifndef __XEN_DRIVERS_BLOCK_H__
+#define __XEN_DRIVERS_BLOCK_H__
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+
+#include <linux/fs.h>
+#include <linux/hdreg.h>
+#include <linux/blkdev.h>
+#include <linux/major.h>
+
+#include <linux/devfs_fs_kernel.h>
+
+#include <asm/hypervisor-ifs/hypervisor-if.h>
+#include <asm/io.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+
+#include <asm-xen/blkif.h>
+
+#if 0
+#define DPRINTK(_f, _a...) printk ( KERN_ALERT _f , ## _a )
+#else
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+#if 0
+#define DPRINTK_IOCTL(_f, _a...) printk ( KERN_ALERT _f , ## _a )
+#else
+#define DPRINTK_IOCTL(_f, _a...) ((void)0)
+#endif
+
/* Static per-class (ide/scsi/vbd) naming and geometry parameters. */
struct xlbd_type_info {
	int partn_shift;	/* log2(minors per unit); drives 'a'+N naming */
	int devs_per_major;
	int hardsect_size;
	int max_sectors;
	char *name;		/* device-node prefix, e.g. "hd"/"sd"/"xvd" */
};

/*
 * We have one of these per vbd, whether ide, scsi or 'other'.  They
 * hang in private_data off the gendisk structure. We may end up
 * putting all kinds of interesting stuff here :-)
 */
struct xlbd_major_info {
	int major;		/* Linux major number claimed for this slot */
	int usage;
	int xd_device;
	struct xlbd_type_info *type;
};

struct xlbd_disk_info {
	int xd_device;		/* Xen device id, placed in ring requests */
	struct xlbd_major_info *mi;
};

/* Minimal per-device bookkeeping (usage count only so far). */
typedef struct xen_block {
	int usage;
} xen_block_t;
+
+extern struct request_queue *xlbd_blk_queue;
+extern spinlock_t blkif_io_lock;
+
+extern int blkif_open(struct inode *inode, struct file *filep);
+extern int blkif_release(struct inode *inode, struct file *filep);
+extern int blkif_ioctl(struct inode *inode, struct file *filep,
+                           unsigned command, unsigned long argument);
+extern int blkif_check(dev_t dev);
+extern int blkif_revalidate(dev_t dev);
+extern void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp);
+extern void do_blkif_request (request_queue_t *rq); 
+
+extern void xlvbd_update_vbds(void);
+
+/* Virtual block-device subsystem. */
+extern int  xlvbd_init(void);
+extern void xlvbd_cleanup(void); 
+
+#endif /* __XEN_DRIVERS_BLOCK_H__ */
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/blkfront/vbd.c b/linux-2.6.7-xen-sparse/drivers/xen/blkfront/vbd.c
new file mode 100644 (file)
index 0000000..e7c5453
--- /dev/null
@@ -0,0 +1,530 @@
+/******************************************************************************
+ * vbd.c
+ * 
+ * XenLinux virtual block-device driver (xvd).
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004, Christian Limpach
+ */
+
+#include "block.h"
+#include <linux/blkdev.h>
+
+/*
+ * For convenience we distinguish between ide, scsi and 'other' (i.e.
+ * potentially combinations of the two) in the naming scheme and in a few 
+ * other places (like default readahead, etc).
+ */
+
+#define NUM_IDE_MAJORS 10
+#define NUM_SCSI_MAJORS 9
+#define NUM_VBD_MAJORS 1
+
+static struct xlbd_type_info xlbd_ide_type = {
+       .partn_shift = 6,
+       // XXXcl todo blksize_size[major]  = 1024;
+       .hardsect_size = 512,
+       .max_sectors = 128,  /* 'hwif->rqsize' if we knew it */
+       // XXXcl todo read_ahead[major]    = 8; /* from drivers/ide/ide-probe.c */
+       .name = "hd",
+};
+
+static struct xlbd_type_info xlbd_scsi_type = {
+       .partn_shift = 4,
+       // XXXcl todo blksize_size[major]  = 1024; /* XXX 512; */
+       .hardsect_size = 512,
+       .max_sectors = 128*8, /* XXX 128; */
+       // XXXcl todo read_ahead[major]    = 0; /* XXX 8; -- guessing */
+       .name = "sd",
+};
+
+static struct xlbd_type_info xlbd_vbd_type = {
+       .partn_shift = 4,
+       // XXXcl todo blksize_size[major]  = 512;
+       .hardsect_size = 512,
+       .max_sectors = 128,
+       // XXXcl todo read_ahead[major]    = 8;
+       .name = "xvd",
+};
+
+/* XXXcl handle cciss after finding out why it's "hacked" in */
+
+static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS +
+                                        NUM_VBD_MAJORS];
+
+/* Information about our VBDs. */
+#define MAX_VBDS 64
+static int nr_vbds;
+static vdisk_t *vbd_info;
+
+struct request_queue *xlbd_blk_queue = NULL;
+
+#define MAJOR_XEN(dev) ((dev)>>8)
+#define MINOR_XEN(dev) ((dev) & 0xff)
+
+static struct block_device_operations xlvbd_block_fops = 
+{
+       .owner          = THIS_MODULE,
+       .open           = blkif_open,
+       .release        = blkif_release,
+       .ioctl          = blkif_ioctl,
+#if 0
+    check_media_change: blkif_check,
+    revalidate:         blkif_revalidate,
+#endif
+};
+
+spinlock_t blkif_io_lock = SPIN_LOCK_UNLOCKED;
+
+static int xlvbd_get_vbd_info(vdisk_t *disk_info)
+{
+    vdisk_t         *buf = (vdisk_t *)__get_free_page(GFP_KERNEL);
+    blkif_request_t  req;
+    blkif_response_t rsp;
+    int              nr;
+
+    memset(&req, 0, sizeof(req));
+    req.operation   = BLKIF_OP_PROBE;
+    req.nr_segments = 1;
+    req.frame_and_sects[0] = virt_to_machine(buf) | 7;
+
+    blkif_control_send(&req, &rsp);
+
+    if ( rsp.status <= 0 )
+    {
+        printk(KERN_ALERT "Could not probe disks (%d)\n", rsp.status);
+        return -1;
+    }
+
+    if ( (nr = rsp.status) > MAX_VBDS )
+         nr = MAX_VBDS;
+    memcpy(disk_info, buf, nr * sizeof(vdisk_t));
+
+    return nr;
+}
+
+static struct xlbd_major_info *xlbd_get_major_info(int xd_device, int *minor)
+{
+       int mi_idx, new_major;
+       int xd_major = MAJOR_XEN(xd_device); 
+       int xd_minor = MINOR_XEN(xd_device);
+
+       *minor = xd_minor;
+
+       switch (xd_major) {
+       case IDE0_MAJOR: mi_idx = 0; new_major = IDE0_MAJOR; break;
+       case IDE1_MAJOR: mi_idx = 1; new_major = IDE1_MAJOR; break;
+       case IDE2_MAJOR: mi_idx = 2; new_major = IDE2_MAJOR; break;
+       case IDE3_MAJOR: mi_idx = 3; new_major = IDE3_MAJOR; break;
+       case IDE4_MAJOR: mi_idx = 4; new_major = IDE4_MAJOR; break;
+       case IDE5_MAJOR: mi_idx = 5; new_major = IDE5_MAJOR; break;
+       case IDE6_MAJOR: mi_idx = 6; new_major = IDE6_MAJOR; break;
+       case IDE7_MAJOR: mi_idx = 7; new_major = IDE7_MAJOR; break;
+       case IDE8_MAJOR: mi_idx = 8; new_major = IDE8_MAJOR; break;
+       case IDE9_MAJOR: mi_idx = 9; new_major = IDE9_MAJOR; break;
+       case SCSI_DISK0_MAJOR: mi_idx = 10; new_major = SCSI_DISK0_MAJOR; break;
+       case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
+               mi_idx = 11 + xd_major - SCSI_DISK1_MAJOR;
+               new_major = SCSI_DISK1_MAJOR + xd_major - SCSI_DISK1_MAJOR;
+               break;
+       case SCSI_CDROM_MAJOR: mi_idx = 18; new_major = SCSI_CDROM_MAJOR; break;
+       default: mi_idx = 19; new_major = 0;/* XXXcl notyet */ break;
+       }
+
+       if (major_info[mi_idx])
+               return major_info[mi_idx];
+
+       major_info[mi_idx] = kmalloc(sizeof(struct xlbd_major_info), GFP_KERNEL);
+       if (major_info[mi_idx] == NULL)
+               return NULL;
+
+       memset(major_info[mi_idx], 0, sizeof(struct xlbd_major_info));
+
+       switch (mi_idx) {
+       case 0 ... (NUM_IDE_MAJORS - 1):
+               major_info[mi_idx]->type = &xlbd_ide_type;
+               break;
+       case NUM_IDE_MAJORS ... (NUM_IDE_MAJORS + NUM_SCSI_MAJORS - 1):
+               major_info[mi_idx]->type = &xlbd_scsi_type;
+               break;
+       case (NUM_IDE_MAJORS + NUM_SCSI_MAJORS) ...
+               (NUM_IDE_MAJORS + NUM_SCSI_MAJORS + NUM_VBD_MAJORS - 1):
+               major_info[mi_idx]->type = &xlbd_vbd_type;
+               break;
+       }
+       major_info[mi_idx]->major = new_major;
+
+       if (register_blkdev(major_info[mi_idx]->major, major_info[mi_idx]->type->name)) {
+               printk(KERN_ALERT "XL VBD: can't get major %d with name %s\n",
+                   major_info[mi_idx]->major, major_info[mi_idx]->type->name);
+               goto out;
+       }
+
+       devfs_mk_dir(major_info[mi_idx]->type->name);
+
+       return major_info[mi_idx];
+
+ out:
+       kfree(major_info[mi_idx]);
+       major_info[mi_idx] = NULL;
+       return NULL;
+}
+
+static struct gendisk *xlvbd_get_gendisk(struct xlbd_major_info *mi,
+                                        int xd_minor, vdisk_t *xd)
+{
+       struct gendisk *gd;
+       struct xlbd_disk_info *di;
+       int device, partno;
+
+       device = MKDEV(mi->major, xd_minor);
+       gd = get_gendisk(device, &partno);
+       if (gd)
+               return gd;
+
+       di = kmalloc(sizeof(struct xlbd_disk_info), GFP_KERNEL);
+       if (di == NULL)
+               return NULL;
+       di->mi = mi;
+       di->xd_device = xd->device;
+
+       /* Construct an appropriate gendisk structure. */
+       gd = alloc_disk(1);
+       if (gd == NULL)
+               goto out;
+
+       gd->major = mi->major;
+       gd->first_minor = xd_minor;
+       gd->fops = &xlvbd_block_fops;
+       gd->private_data = di;
+       sprintf(gd->disk_name, "%s%c%d", mi->type->name,
+           'a' + (xd_minor >> mi->type->partn_shift),
+           xd_minor & ((1 << mi->type->partn_shift) - 1));
+       /*  sprintf(gd->devfs_name, "%s%s/disc%d", mi->type->name, , ); XXXdevfs */
+
+       set_capacity(gd, xd->capacity);
+
+       if (xlbd_blk_queue == NULL) {
+               xlbd_blk_queue = blk_init_queue(do_blkif_request,
+                                               &blkif_io_lock);
+               if (xlbd_blk_queue == NULL)
+                       goto out;
+               elevator_init(xlbd_blk_queue, &elevator_noop);
+
+               /*
+                * Turn off barking 'headactive' mode. We dequeue
+                * buffer heads as soon as we pass them to back-end
+                * driver.
+                */
+               blk_queue_headactive(xlbd_blk_queue, 0); /* XXXcl: noop according to blkdev.h */
+
+               blk_queue_hardsect_size(xlbd_blk_queue,
+                                       mi->type->hardsect_size);
+               blk_queue_max_sectors(xlbd_blk_queue, mi->type->max_sectors); /* 'hwif->rqsize' if we knew it */
+
+               /* XXXcl: set mask to PAGE_SIZE for now, to improve either use 
+                  - blk_queue_merge_bvec to merge requests with adjacent ma's
+                  - the tags infrastructure
+                  - the dma infrastructure
+               */
+               blk_queue_segment_boundary(xlbd_blk_queue, PAGE_SIZE - 1);
+
+               blk_queue_max_phys_segments(xlbd_blk_queue,
+                    BLKIF_MAX_SEGMENTS_PER_REQUEST);
+               blk_queue_max_hw_segments(xlbd_blk_queue,
+                    BLKIF_MAX_SEGMENTS_PER_REQUEST); /* XXXcl not needed? */
+
+
+       }
+       gd->queue = xlbd_blk_queue;
+
+       add_disk(gd);
+
+       return gd;
+
+ out:
+       if (gd)
+               del_gendisk(gd);
+       kfree(di);
+       return NULL;
+}
+
+/*
+ * xlvbd_init_device - initialise a VBD device
+ * @disk:              a vdisk_t describing the VBD
+ *
+ * Takes a vdisk_t * that describes a VBD the domain has access to.
+ * Performs appropriate initialisation and registration of the device.
+ *
+ * Care needs to be taken when making re-entrant calls to ensure that
+ * corruption does not occur.  Also, devices that are in use should not have
+ * their details updated.  This is the caller's responsibility.
+ */
+static int xlvbd_init_device(vdisk_t *xd)
+{
+       struct block_device *bd;
+       struct gendisk *gd;
+       struct xlbd_major_info *mi;
+       int device;
+       int minor;
+
+       int err = -ENOMEM;
+
+       mi = xlbd_get_major_info(xd->device, &minor);
+       if (mi == NULL)
+               return -EPERM;
+
+       device = MKDEV(mi->major, minor);
+
+       if ((bd = bdget(device)) == NULL)
+               return -EPERM;
+
+       /*
+        * Update of partition info, and check of usage count, is protected
+        * by the per-block-device semaphore.
+        */
+       down(&bd->bd_sem);
+
+       gd = xlvbd_get_gendisk(mi, minor, xd);
+       if (mi == NULL) {
+               err = -EPERM;
+               goto out;
+       }
+
+       if (VDISK_READONLY(xd->info))
+               set_disk_ro(gd, 1); 
+
+       /* Some final fix-ups depending on the device type */
+       switch (VDISK_TYPE(xd->info)) { 
+       case VDISK_TYPE_CDROM:
+               gd->flags |= GENHD_FL_REMOVABLE | GENHD_FL_CD; 
+               /* FALLTHROUGH */
+       case VDISK_TYPE_FLOPPY: 
+       case VDISK_TYPE_TAPE:
+               gd->flags |= GENHD_FL_REMOVABLE; 
+               break; 
+
+       case VDISK_TYPE_DISK:
+               break; 
+
+       default:
+               printk(KERN_ALERT "XenLinux: unknown device type %d\n", 
+                   VDISK_TYPE(xd->info)); 
+               break; 
+       }
+
+       err = 0;
+ out:
+       up(&bd->bd_sem);
+       bdput(bd);    
+       return err;
+}
+
+#if 0
+/*
+ * xlvbd_remove_device - remove a device node if possible
+ * @device:       numeric device ID
+ *
+ * Updates the gendisk structure and invalidates devices.
+ *
+ * This is OK for now but in future, should perhaps consider where this should
+ * deallocate gendisks / unregister devices.
+ */
+static int xlvbd_remove_device(int device)
+{
+    int i, rc = 0, minor = MINOR(device);
+    struct gendisk *gd;
+    struct block_device *bd;
+    xen_block_t *disk = NULL;
+
+    if ( (bd = bdget(device)) == NULL )
+        return -1;
+
+    /*
+     * Update of partition info, and check of usage count, is protected
+     * by the per-block-device semaphore.
+     */
+    down(&bd->bd_sem);
+
+    if ( ((gd = get_gendisk(device)) == NULL) ||
+         ((disk = xldev_to_xldisk(device)) == NULL) )
+        BUG();
+
+    if ( disk->usage != 0 )
+    {
+        printk(KERN_ALERT "VBD removal failed - in use [dev=%x]\n", device);
+        rc = -1;
+        goto out;
+    }
+    if ( (minor & (gd->max_p-1)) != 0 )
+    {
+        /* 1: The VBD is mapped to a partition rather than a whole unit. */
+        invalidate_device(device, 1);
+       gd->part[minor].start_sect = 0;
+        gd->part[minor].nr_sects   = 0;
+        gd->sizes[minor]           = 0;
+
+        /* Clear the consists-of-virtual-partitions flag if possible. */
+        gd->flags[minor >> gd->minor_shift] &= ~GENHD_FL_VIRT_PARTNS;
+        for ( i = 1; i < gd->max_p; i++ )
+            if ( gd->sizes[(minor & ~(gd->max_p-1)) + i] != 0 )
+                gd->flags[minor >> gd->minor_shift] |= GENHD_FL_VIRT_PARTNS;
+
+        /*
+         * If all virtual partitions are now gone, and a 'whole unit' VBD is
+         * present, then we can try to grok the unit's real partition table.
+         */
+        if ( !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) &&
+             (gd->sizes[minor & ~(gd->max_p-1)] != 0) &&
+             !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE) )
+        {
+            register_disk(gd,
+                          device&~(gd->max_p-1), 
+                          gd->max_p, 
+                          &xlvbd_block_fops,
+                          gd->part[minor&~(gd->max_p-1)].nr_sects);
+        }
+    }
+    else
+    {
+        /*
+         * 2: The VBD is mapped to an entire 'unit'. Clear all partitions.
+         * NB. The partition entries are only cleared if there are no VBDs
+         * mapped to individual partitions on this unit.
+         */
+        i = gd->max_p - 1; /* Default: clear subpartitions as well. */
+        if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS )
+            i = 0; /* 'Virtual' mode: only clear the 'whole unit' entry. */
+        while ( i >= 0 )
+        {
+            invalidate_device(device+i, 1);
+            gd->part[minor+i].start_sect = 0;
+            gd->part[minor+i].nr_sects   = 0;
+            gd->sizes[minor+i]           = 0;
+            i--;
+        }
+    }
+
+ out:
+    up(&bd->bd_sem);
+    bdput(bd);
+    return rc;
+}
+
+/*
+ * xlvbd_update_vbds - reprobes the VBD status and performs updates driver
+ * state. The VBDs need to be updated in this way when the domain is
+ * initialised and also each time we receive an XLBLK_UPDATE event.
+ */
+void xlvbd_update_vbds(void)
+{
+    int i, j, k, old_nr, new_nr;
+    vdisk_t *old_info, *new_info, *merged_info;
+
+    old_info = vbd_info;
+    old_nr   = nr_vbds;
+
+    new_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL);
+    if ( unlikely(new_nr = xlvbd_get_vbd_info(new_info)) < 0 )
+    {
+        kfree(new_info);
+        return;
+    }
+
+    /*
+     * Final list maximum size is old list + new list. This occurs only when
+     * old list and new list do not overlap at all, and we cannot yet destroy
+     * VBDs in the old list because the usage counts are busy.
+     */
+    merged_info = kmalloc((old_nr + new_nr) * sizeof(vdisk_t), GFP_KERNEL);
+
+    /* @i tracks old list; @j tracks new list; @k tracks merged list. */
+    i = j = k = 0;
+
+    while ( (i < old_nr) && (j < new_nr) )
+    {
+        if ( old_info[i].device < new_info[j].device )
+        {
+            if ( xlvbd_remove_device(old_info[i].device) != 0 )
+                memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t));
+            i++;
+        }
+        else if ( old_info[i].device > new_info[j].device )
+        {
+            if ( xlvbd_init_device(&new_info[j]) == 0 )
+                memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t));
+            j++;
+        }
+        else
+        {
+            if ( ((old_info[i].capacity == new_info[j].capacity) &&
+                  (old_info[i].info == new_info[j].info)) ||
+                 (xlvbd_remove_device(old_info[i].device) != 0) )
+                memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t));
+            else if ( xlvbd_init_device(&new_info[j]) == 0 )
+                memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t));
+            i++; j++;
+        }
+    }
+
+    for ( ; i < old_nr; i++ )
+    {
+        if ( xlvbd_remove_device(old_info[i].device) != 0 )
+            memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t));
+    }
+
+    for ( ; j < new_nr; j++ )
+    {
+        if ( xlvbd_init_device(&new_info[j]) == 0 )
+            memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t));
+    }
+
+    vbd_info = merged_info;
+    nr_vbds  = k;
+
+    kfree(old_info);
+    kfree(new_info);
+}
+#endif
+
+/*
+ * Set up all the linux device goop for the virtual block devices
+ * (vbd's) that we know about. Note that although from the backend
+ * driver's p.o.v. VBDs are addressed simply an opaque 16-bit device
+ * number, the domain creation tools conventionally allocate these
+ * numbers to correspond to those used by 'real' linux -- this is just
+ * for convenience as it means e.g. that the same /etc/fstab can be
+ * used when booting with or without Xen.
+ */
+int xlvbd_init(void)
+{
+       int i;
+
+       /*
+        * If compiled as a module, we don't support unloading yet. We
+        * therefore permanently increment the reference count to
+        * disallow it.
+        */
+       MOD_INC_USE_COUNT;
+
+       memset(major_info, 0, sizeof(major_info));
+
+       for (i = 0; i < sizeof(major_info) / sizeof(major_info[0]); i++) {
+       }
+
+       vbd_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL);
+       nr_vbds  = xlvbd_get_vbd_info(vbd_info);
+
+       if (nr_vbds < 0) {
+               kfree(vbd_info);
+               vbd_info = NULL;
+               nr_vbds  = 0;
+       } else {
+               for (i = 0; i < nr_vbds; i++)
+                       xlvbd_init_device(&vbd_info[i]);
+       }
+
+       return 0;
+}
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/block/Kconfig b/linux-2.6.7-xen-sparse/drivers/xen/block/Kconfig
deleted file mode 100644 (file)
index edde837..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-
-config XENBLOCK
-       tristate "Block device driver"
-       depends on ARCH_XEN
-       help
-         Block device driver for Xen
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/block/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/block/Makefile
deleted file mode 100644 (file)
index de77b96..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-
-obj-y  := vbd.o block.o
-
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/block/block.c b/linux-2.6.7-xen-sparse/drivers/xen/block/block.c
deleted file mode 100644 (file)
index db9c6b4..0000000
+++ /dev/null
@@ -1,653 +0,0 @@
-/******************************************************************************
- * block.c
- * 
- * XenLinux virtual block-device driver.
- * 
- * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
- * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
- * Copyright (c) 2004, Christian Limpach
- */
-
-#include "block.h"
-#include <linux/cdrom.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <scsi/scsi.h>
-#include <asm-xen/ctrl_if.h>
-
-typedef unsigned char byte; /* from linux/ide.h */
-
-#define BLKIF_STATE_CLOSED       0
-#define BLKIF_STATE_DISCONNECTED 1
-#define BLKIF_STATE_CONNECTED    2
-static unsigned int blkif_state = BLKIF_STATE_CLOSED;
-static unsigned int blkif_evtchn, blkif_irq;
-
-static int blkif_control_rsp_valid;
-static blkif_response_t blkif_control_rsp;
-
-static blkif_ring_t *blk_ring;
-static BLKIF_RING_IDX resp_cons; /* Response consumer for comms ring. */
-static BLKIF_RING_IDX req_prod;  /* Private request producer.         */
-
-static blkif_ring_t *blk_ring_rec; /* Private copy of requests, used for
-                                    * recovery.  Responses not stored here. */
-static BLKIF_RING_IDX resp_cons_rec; /* Copy of response consumer, used for
-                                      * recovery */
-static int recovery = 0;           /* "Recovery in progress" flag.  Protected
-                                    * by the blkif_io_lock */
-
-/* We plug the I/O ring if the driver is suspended or if the ring is full. */
-#define        BLKIF_RING_FULL (((req_prod - resp_cons) == BLKIF_RING_SIZE) || \
-                        (blkif_state != BLKIF_STATE_CONNECTED))
-
-/*
- * Request queues with outstanding work, but ring is currently full.
- * We need no special lock here, as we always access this with the
- * blkif_io_lock held. We only need a small maximum list.
- */
-#define MAX_PENDING 8
-static request_queue_t *pending_queues[MAX_PENDING];
-static int nr_pending;
-
-static inline void flush_requests(void)
-{
-
-        blk_ring->req_prod = req_prod;
-
-        notify_via_evtchn(blkif_evtchn);
-}
-
-
-#if 0
-/*
- * blkif_update_int/update-vbds_task - handle VBD update events.
- *  Schedule a task for keventd to run, which will update the VBDs and perform 
- *  the corresponding updates to our view of VBD state.
- */
-static struct tq_struct update_tq;
-static void update_vbds_task(void *unused)
-{ 
-    xlvbd_update_vbds();
-}
-#endif
-
-
-int blkif_open(struct inode *inode, struct file *filep)
-{
-       struct gendisk *gd = inode->i_bdev->bd_disk;
-       struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
-
-       /* Update of usage count is protected by per-device semaphore. */
-       di->mi->usage++;
-
-       return 0;
-}
-
-
-int blkif_release(struct inode *inode, struct file *filep)
-{
-       struct gendisk *gd = inode->i_bdev->bd_disk;
-       struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
-
-       /*
-        * When usage drops to zero it may allow more VBD updates to occur.
-        * Update of usage count is protected by a per-device semaphore.
-        */
-       if (--di->mi->usage == 0) {
-#if 0
-               update_tq.routine = update_vbds_task;
-               schedule_task(&update_tq);
-#endif
-       }
-
-       return 0;
-}
-
-
-int blkif_ioctl(struct inode *inode, struct file *filep,
-                          unsigned command, unsigned long argument)
-{
-       /*  struct gendisk *gd = inode->i_bdev->bd_disk; */
-
-       DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
-           command, (long)argument, inode->i_rdev); 
-  
-       switch (command) {
-
-       case HDIO_GETGEO:
-               /* return ENOSYS to use defaults */
-               return -ENOSYS;
-
-       default:
-               printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
-                      command);
-               return -ENOSYS;
-       }
-
-       return 0;
-}
-
-#if 0
-/* check media change: should probably do something here in some cases :-) */
-int blkif_check(kdev_t dev)
-{
-    DPRINTK("blkif_check\n");
-    return 0;
-}
-
-int blkif_revalidate(kdev_t dev)
-{
-    struct block_device *bd;
-    struct gendisk *gd;
-    xen_block_t *disk;
-    unsigned long capacity;
-    int i, rc = 0;
-    
-    if ( (bd = bdget(dev)) == NULL )
-        return -EINVAL;
-
-    /*
-     * Update of partition info, and check of usage count, is protected
-     * by the per-block-device semaphore.
-     */
-    down(&bd->bd_sem);
-
-    if ( ((gd = get_gendisk(dev)) == NULL) ||
-         ((disk = xldev_to_xldisk(dev)) == NULL) ||
-         ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
-    {
-        rc = -EINVAL;
-        goto out;
-    }
-
-    if ( disk->usage > 1 )
-    {
-        rc = -EBUSY;
-        goto out;
-    }
-
-    /* Only reread partition table if VBDs aren't mapped to partitions. */
-    if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
-    {
-        for ( i = gd->max_p - 1; i >= 0; i-- )
-        {
-            invalidate_device(dev+i, 1);
-            gd->part[MINOR(dev+i)].start_sect = 0;
-            gd->part[MINOR(dev+i)].nr_sects   = 0;
-            gd->sizes[MINOR(dev+i)]           = 0;
-        }
-
-        grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
-    }
-
- out:
-    up(&bd->bd_sem);
-    bdput(bd);
-    return rc;
-}
-#endif
-
-
-/*
- * blkif_queue_request
- *
- * request block io 
- * 
- * id: for guest use only.
- * operation: BLKIF_OP_{READ,WRITE,PROBE}
- * buffer: buffer to read/write into. this should be a
- *   virtual address in the guest os.
- */
-static int blkif_queue_request(struct request *req)
-{
-       struct xlbd_disk_info *di =
-               (struct xlbd_disk_info *)req->rq_disk->private_data;
-       unsigned long buffer_ma;
-       blkif_request_t *ring_req;
-       struct bio *bio;
-       struct bio_vec *bvec;
-       int idx, s;
-        unsigned int fsect, lsect;
-
-        if (unlikely(blkif_state != BLKIF_STATE_CONNECTED))
-                return 1;
-
-       /* Fill out a communications ring structure. */
-       ring_req = &blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req;
-       ring_req->id = (unsigned long)req;
-       ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE :
-               BLKIF_OP_READ;
-       ring_req->sector_number = (blkif_sector_t)req->sector;
-       ring_req->device = di->xd_device;
-
-       s = 0;
-       ring_req->nr_segments = 0;
-       rq_for_each_bio(bio, req) {
-               bio_for_each_segment(bvec, bio, idx) {
-                       buffer_ma =
-                                phys_to_machine(page_to_phys(bvec->bv_page));
-                       if (unlikely((buffer_ma & ((1<<9)-1)) != 0))
-                               BUG();
-
-                        fsect = bvec->bv_offset >> 9;
-                        lsect = fsect + (bvec->bv_len >> 9) - 1;
-                        if (unlikely(lsect > 7))
-                                BUG();
-
-                       ring_req->frame_and_sects[ring_req->nr_segments++] =
-                               buffer_ma | (fsect << 3) | lsect;
-                       s += bvec->bv_len >> 9;
-               }
-       }
-
-       req_prod++;
-
-        /* Keep a private copy so we can reissue requests when recovering. */
-        blk_ring_rec->ring[MASK_BLKIF_IDX(blk_ring_rec->req_prod)].req =
-                *ring_req;
-        blk_ring_rec->req_prod++;
-
-        return 0;
-}
-
-/*
- * do_blkif_request
- *  read a block; request is in a request queue
- */
-void do_blkif_request(request_queue_t *rq)
-{
-       struct request *req;
-       int queued;
-
-       DPRINTK("Entered do_blkif_request\n"); 
-
-       queued = 0;
-
-       while ((req = elv_next_request(rq)) != NULL) {
-               if (!blk_fs_request(req)) {
-                       end_request(req, 0);
-                       continue;
-               }
-
-               if (BLKIF_RING_FULL) {
-                       blk_stop_queue(rq);
-                       break;
-               }
-               DPRINTK("do_blkif_request %p: cmd %p, sec %lx, (%u/%li) buffer:%p [%s]\n",
-                   req, req->cmd, req->sector, req->current_nr_sectors,
-                   req->nr_sectors, req->buffer,
-                   rq_data_dir(req) ? "write" : "read");
-                blkdev_dequeue_request(req);
-               if (blkif_queue_request(req)) {
-                        blk_stop_queue(rq);
-                        break;
-                }
-               queued++;
-       }
-
-       if (queued != 0)
-               flush_requests();
-}
-
-
-static void kick_pending_request_queues(void)
-{
-    /* We kick pending request queues if the ring is reasonably empty. */
-    if ( (nr_pending != 0) && 
-         ((req_prod - resp_cons) < (BLKIF_RING_SIZE >> 1)) )
-    {
-        /* Attempt to drain the queue, but bail if the ring becomes full. */
-        while ( (nr_pending != 0) && !BLKIF_RING_FULL )
-            do_blkif_request(pending_queues[--nr_pending]);
-    }
-}
-
-
-static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
-{
-       struct request *req;
-       blkif_response_t *bret;
-       BLKIF_RING_IDX i; 
-       unsigned long flags; 
-
-       spin_lock_irqsave(&blkif_io_lock, flags);     
-
-        if (unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery)) {
-                printk("Bailed out\n");
-        
-                spin_unlock_irqrestore(&blkif_io_lock, flags);
-                return IRQ_HANDLED;
-        }
-
-       for (i = resp_cons; i != blk_ring->resp_prod; i++) {
-               bret = &blk_ring->ring[MASK_BLKIF_IDX(i)].resp;
-               switch (bret->operation) {
-               case BLKIF_OP_READ:
-               case BLKIF_OP_WRITE:
-                       if (unlikely(bret->status != BLKIF_RSP_OKAY))
-                               DPRINTK("Bad return from blkdev data request: %lx\n",
-                                   bret->status);
-                       req = (struct request *)bret->id;
-                        /* XXXcl pass up status */
-                       if (unlikely(end_that_request_first(req, 1,
-                           req->hard_nr_sectors)))
-                               BUG();
-
-                       end_that_request_last(req);
-                       break;
-                case BLKIF_OP_PROBE:
-                        memcpy(&blkif_control_rsp, bret, sizeof(*bret));
-                        blkif_control_rsp_valid = 1;
-                        break;
-               default:
-                       BUG();
-               }
-       }
-    
-       resp_cons = i;
-        resp_cons_rec = i;
-
-       if (xlbd_blk_queue &&
-            test_bit(QUEUE_FLAG_STOPPED, &xlbd_blk_queue->queue_flags)) {
-               blk_start_queue(xlbd_blk_queue);
-               /* XXXcl call to request_fn should not be needed but
-                 * we get stuck without...  needs investigating
-                */
-               xlbd_blk_queue->request_fn(xlbd_blk_queue);
-       }
-
-       spin_unlock_irqrestore(&blkif_io_lock, flags);
-
-       return IRQ_HANDLED;
-}
-
-
-void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp)
-{
-    unsigned long flags;
-
- retry:
-    while ( (req_prod - resp_cons) == BLKIF_RING_SIZE )
-    {
-        set_current_state(TASK_INTERRUPTIBLE);
-        schedule_timeout(1);
-    }
-
-    spin_lock_irqsave(&blkif_io_lock, flags);
-    if ( (req_prod - resp_cons) == BLKIF_RING_SIZE )
-    {
-        spin_unlock_irqrestore(&blkif_io_lock, flags);
-        goto retry;
-    }
-
-    memcpy(&blk_ring->ring[MASK_BLKIF_IDX(req_prod)].req, req, sizeof(*req));
-    memcpy(&blk_ring_rec->ring[MASK_BLKIF_IDX(blk_ring_rec->req_prod++)].req,
-           req, sizeof(*req));
-    req_prod++;
-    flush_requests();
-
-    spin_unlock_irqrestore(&blkif_io_lock, flags);
-
-    while ( !blkif_control_rsp_valid )
-    {
-        set_current_state(TASK_INTERRUPTIBLE);
-        schedule_timeout(1);
-    }
-
-    memcpy(rsp, &blkif_control_rsp, sizeof(*rsp));
-    blkif_control_rsp_valid = 0;
-}
-
-
-static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
-{
-    ctrl_msg_t                   cmsg;
-    blkif_fe_interface_connect_t up;
-
-    if ( status->handle != 0 )
-    {
-        printk(KERN_WARNING "Status change on unsupported blkif %d\n",
-               status->handle);
-        return;
-    }
-
-    switch ( status->status )
-    {
-    case BLKIF_INTERFACE_STATUS_DESTROYED:
-        printk(KERN_WARNING "Unexpected blkif-DESTROYED message in state %d\n",
-               blkif_state);
-        break;
-
-    case BLKIF_INTERFACE_STATUS_DISCONNECTED:
-        if ( blkif_state != BLKIF_STATE_CLOSED )
-        {
-            printk(KERN_WARNING "Unexpected blkif-DISCONNECTED message"
-                   " in state %d\n", blkif_state);
-
-            printk(KERN_INFO "VBD driver recovery in progress\n");
-            
-            /* Prevent new requests being issued until we fix things up. */
-            spin_lock_irq(&blkif_io_lock);
-            recovery = 1;
-            blkif_state = BLKIF_STATE_DISCONNECTED;
-            spin_unlock_irq(&blkif_io_lock);
-
-            /* Free resources associated with old device channel. */
-            free_page((unsigned long)blk_ring);
-            free_irq(blkif_irq, NULL);
-            unbind_evtchn_from_irq(blkif_evtchn);
-        }
-
-        /* Move from CLOSED to DISCONNECTED state. */
-        blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
-        blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
-        blkif_state  = BLKIF_STATE_DISCONNECTED;
-
-        /* Construct an interface-CONNECT message for the domain controller. */
-        cmsg.type      = CMSG_BLKIF_FE;
-        cmsg.subtype   = CMSG_BLKIF_FE_INTERFACE_CONNECT;
-        cmsg.length    = sizeof(blkif_fe_interface_connect_t);
-        up.handle      = 0;
-        up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT;
-        memcpy(cmsg.msg, &up, sizeof(up));
-        
-        /* Tell the controller to bring up the interface. */
-        ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-        break;
-
-    case BLKIF_INTERFACE_STATUS_CONNECTED:
-        if ( blkif_state == BLKIF_STATE_CLOSED )
-        {
-            printk(KERN_WARNING "Unexpected blkif-CONNECTED message"
-                   " in state %d\n", blkif_state);
-            break;
-        }
-
-        blkif_evtchn = status->evtchn;
-        blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
-        (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL);
-
-        if ( recovery )
-        {
-            int i;
-
-           /* Shouldn't need the blkif_io_lock here - the device is
-            * plugged and the recovery flag prevents the interrupt handler
-            * changing anything. */
-
-            /* Reissue requests from the private block ring. */
-            for ( i = 0;
-                 resp_cons_rec < blk_ring_rec->req_prod;
-                  resp_cons_rec++, i++ )
-            {
-                blk_ring->ring[i].req
-                    = blk_ring_rec->ring[MASK_BLKIF_IDX(resp_cons_rec)].req;
-            }
-
-            /* Reset the private block ring to match the new ring. */
-            memcpy(blk_ring_rec, blk_ring, sizeof(*blk_ring));
-            resp_cons_rec = 0;
-
-            /* blk_ring->req_prod will be set when we flush_requests().*/
-            blk_ring_rec->req_prod = req_prod = i;
-
-            wmb();
-
-            /* Switch off recovery mode, using a memory barrier to ensure that
-             * it's seen before we flush requests - we don't want to miss any
-             * interrupts. */
-            recovery = 0;
-            wmb();
-
-            /* Kicks things back into life. */
-            flush_requests();
-        }
-        else
-        {
-            /* Probe for discs that are attached to the interface. */
-            xlvbd_init();
-        }
-
-        blkif_state = BLKIF_STATE_CONNECTED;
-        
-        /* Kick pending requests. */
-        spin_lock_irq(&blkif_io_lock);
-        kick_pending_request_queues();
-        spin_unlock_irq(&blkif_io_lock);
-
-        break;
-
-    default:
-        printk(KERN_WARNING "Status change to unknown value %d\n", 
-               status->status);
-        break;
-    }
-}
-
-
-static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
-{
-    switch ( msg->subtype )
-    {
-    case CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED:
-        if ( msg->length != sizeof(blkif_fe_interface_status_changed_t) )
-            goto parse_error;
-        blkif_status_change((blkif_fe_interface_status_changed_t *)
-                            &msg->msg[0]);
-        break;        
-#if 0
-    case CMSG_BLKIF_FE_VBD_STATUS_CHANGED:
-        update_tq.routine = update_vbds_task;
-        schedule_task(&update_tq);
-        break;
-#endif
-    default:
-        goto parse_error;
-    }
-
-    ctrl_if_send_response(msg);
-    return;
-
- parse_error:
-    msg->length = 0;
-    ctrl_if_send_response(msg);
-}
-
-
-int __init xlblk_init(void)
-{
-    ctrl_msg_t                       cmsg;
-    blkif_fe_driver_status_changed_t st;
-
-    if ( (start_info.flags & SIF_INITDOMAIN) 
-        || (start_info.flags & SIF_BLK_BE_DOMAIN) )
-        return 0;
-
-    printk(KERN_INFO "Initialising Xen virtual block device\n");
-
-    blk_ring_rec = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
-    memset(blk_ring_rec, 0, sizeof(*blk_ring_rec));
-
-    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
-                                    CALLBACK_IN_BLOCKING_CONTEXT);
-
-    /* Send a driver-UP notification to the domain controller. */
-    cmsg.type      = CMSG_BLKIF_FE;
-    cmsg.subtype   = CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED;
-    cmsg.length    = sizeof(blkif_fe_driver_status_changed_t);
-    st.status      = BLKIF_DRIVER_STATUS_UP;
-    memcpy(cmsg.msg, &st, sizeof(st));
-    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-
-    /*
-     * We should read 'nr_interfaces' from response message and wait
-     * for notifications before proceeding. For now we assume that we
-     * will be notified of exactly one interface.
-     */
-    while ( blkif_state != BLKIF_STATE_CONNECTED )
-    {
-        set_current_state(TASK_INTERRUPTIBLE);
-        schedule_timeout(1);
-    }
-
-    return 0;
-#if 0
-       int error; 
-
-       reset_xlblk_interface();
-
-       xlblk_response_irq = bind_virq_to_irq(VIRQ_BLKDEV);
-       xlblk_update_irq   = bind_virq_to_irq(VIRQ_VBD_UPD);
-
-       error = request_irq(xlblk_response_irq, xlblk_response_int, 
-                           SA_SAMPLE_RANDOM, "blkdev", NULL);
-       if (error) {
-               printk(KERN_ALERT "Could not allocate receive interrupt\n");
-               goto fail;
-       }
-
-       error = request_irq(xlblk_update_irq, xlblk_update_int,
-                           0, "blkdev", NULL);
-       if (error) {
-               printk(KERN_ALERT
-                      "Could not allocate block update interrupt\n");
-               goto fail;
-       }
-
-       (void)xlvbd_init();
-
-       return 0;
-
- fail:
-       return error;
-#endif
-}
-
-
-static void __exit xlblk_cleanup(void)
-{
-    /* XXX FIXME */
-    BUG();
-#if 0
-       /*  xlvbd_cleanup(); */
-       free_irq(xlblk_response_irq, NULL);
-       free_irq(xlblk_update_irq, NULL);
-       unbind_virq_from_irq(VIRQ_BLKDEV);
-       unbind_virq_from_irq(VIRQ_VBD_UPD);
-#endif
-}
-
-
-module_init(xlblk_init);
-module_exit(xlblk_cleanup);
-
-
-void blkdev_suspend(void)
-{
-}
-
-
-void blkdev_resume(void)
-{
-}
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/block/block.h b/linux-2.6.7-xen-sparse/drivers/xen/block/block.h
deleted file mode 100644 (file)
index 77002dd..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-/******************************************************************************
- * block.h
- * 
- * Shared definitions between all levels of XenLinux Virtual block devices.
- */
-
-#ifndef __XEN_DRIVERS_BLOCK_H__
-#define __XEN_DRIVERS_BLOCK_H__
-
-#include <linux/config.h>
-#include <linux/module.h>
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-
-#include <linux/fs.h>
-#include <linux/hdreg.h>
-#include <linux/blkdev.h>
-#include <linux/major.h>
-
-#include <linux/devfs_fs_kernel.h>
-
-#include <asm/hypervisor-ifs/hypervisor-if.h>
-#include <asm/io.h>
-#include <asm/atomic.h>
-#include <asm/uaccess.h>
-
-#include <asm-xen/blkif.h>
-
-#if 0
-#define DPRINTK(_f, _a...) printk ( KERN_ALERT _f , ## _a )
-#else
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
-#if 0
-#define DPRINTK_IOCTL(_f, _a...) printk ( KERN_ALERT _f , ## _a )
-#else
-#define DPRINTK_IOCTL(_f, _a...) ((void)0)
-#endif
-
-struct xlbd_type_info {
-       int partn_shift;
-       int devs_per_major;
-       int hardsect_size;
-       int max_sectors;
-       char *name;
-};
-
-/*
- * We have one of these per vbd, whether ide, scsi or 'other'.  They
- * hang in private_data off the gendisk structure. We may end up
- * putting all kinds of interesting stuff here :-)
- */
-struct xlbd_major_info {
-       int major;
-       int usage;
-       int xd_device;
-       struct xlbd_type_info *type;
-};
-
-struct xlbd_disk_info {
-       int xd_device;
-       struct xlbd_major_info *mi;
-};
-
-typedef struct xen_block {
-       int usage;
-} xen_block_t;
-
-extern struct request_queue *xlbd_blk_queue;
-extern spinlock_t blkif_io_lock;
-
-extern int blkif_open(struct inode *inode, struct file *filep);
-extern int blkif_release(struct inode *inode, struct file *filep);
-extern int blkif_ioctl(struct inode *inode, struct file *filep,
-                           unsigned command, unsigned long argument);
-extern int blkif_check(dev_t dev);
-extern int blkif_revalidate(dev_t dev);
-extern void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp);
-extern void do_blkif_request (request_queue_t *rq); 
-
-extern void xlvbd_update_vbds(void);
-
-/* Virtual block-device subsystem. */
-extern int  xlvbd_init(void);
-extern void xlvbd_cleanup(void); 
-
-#endif /* __XEN_DRIVERS_BLOCK_H__ */
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/block/vbd.c b/linux-2.6.7-xen-sparse/drivers/xen/block/vbd.c
deleted file mode 100644 (file)
index e7c5453..0000000
+++ /dev/null
@@ -1,530 +0,0 @@
-/******************************************************************************
- * vbd.c
- * 
- * XenLinux virtual block-device driver (xvd).
- * 
- * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
- * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
- * Copyright (c) 2004, Christian Limpach
- */
-
-#include "block.h"
-#include <linux/blkdev.h>
-
-/*
- * For convenience we distinguish between ide, scsi and 'other' (i.e.
- * potentially combinations of the two) in the naming scheme and in a few 
- * other places (like default readahead, etc).
- */
-
-#define NUM_IDE_MAJORS 10
-#define NUM_SCSI_MAJORS 9
-#define NUM_VBD_MAJORS 1
-
-static struct xlbd_type_info xlbd_ide_type = {
-       .partn_shift = 6,
-       // XXXcl todo blksize_size[major]  = 1024;
-       .hardsect_size = 512,
-       .max_sectors = 128,  /* 'hwif->rqsize' if we knew it */
-       // XXXcl todo read_ahead[major]    = 8; /* from drivers/ide/ide-probe.c */
-       .name = "hd",
-};
-
-static struct xlbd_type_info xlbd_scsi_type = {
-       .partn_shift = 4,
-       // XXXcl todo blksize_size[major]  = 1024; /* XXX 512; */
-       .hardsect_size = 512,
-       .max_sectors = 128*8, /* XXX 128; */
-       // XXXcl todo read_ahead[major]    = 0; /* XXX 8; -- guessing */
-       .name = "sd",
-};
-
-static struct xlbd_type_info xlbd_vbd_type = {
-       .partn_shift = 4,
-       // XXXcl todo blksize_size[major]  = 512;
-       .hardsect_size = 512,
-       .max_sectors = 128,
-       // XXXcl todo read_ahead[major]    = 8;
-       .name = "xvd",
-};
-
-/* XXXcl handle cciss after finding out why it's "hacked" in */
-
-static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS +
-                                        NUM_VBD_MAJORS];
-
-/* Information about our VBDs. */
-#define MAX_VBDS 64
-static int nr_vbds;
-static vdisk_t *vbd_info;
-
-struct request_queue *xlbd_blk_queue = NULL;
-
-#define MAJOR_XEN(dev) ((dev)>>8)
-#define MINOR_XEN(dev) ((dev) & 0xff)
-
-static struct block_device_operations xlvbd_block_fops = 
-{
-       .owner          = THIS_MODULE,
-       .open           = blkif_open,
-       .release        = blkif_release,
-       .ioctl          = blkif_ioctl,
-#if 0
-    check_media_change: blkif_check,
-    revalidate:         blkif_revalidate,
-#endif
-};
-
-spinlock_t blkif_io_lock = SPIN_LOCK_UNLOCKED;
-
-static int xlvbd_get_vbd_info(vdisk_t *disk_info)
-{
-    vdisk_t         *buf = (vdisk_t *)__get_free_page(GFP_KERNEL);
-    blkif_request_t  req;
-    blkif_response_t rsp;
-    int              nr;
-
-    memset(&req, 0, sizeof(req));
-    req.operation   = BLKIF_OP_PROBE;
-    req.nr_segments = 1;
-    req.frame_and_sects[0] = virt_to_machine(buf) | 7;
-
-    blkif_control_send(&req, &rsp);
-
-    if ( rsp.status <= 0 )
-    {
-        printk(KERN_ALERT "Could not probe disks (%d)\n", rsp.status);
-        return -1;
-    }
-
-    if ( (nr = rsp.status) > MAX_VBDS )
-         nr = MAX_VBDS;
-    memcpy(disk_info, buf, nr * sizeof(vdisk_t));
-
-    return nr;
-}
-
-static struct xlbd_major_info *xlbd_get_major_info(int xd_device, int *minor)
-{
-       int mi_idx, new_major;
-       int xd_major = MAJOR_XEN(xd_device); 
-       int xd_minor = MINOR_XEN(xd_device);
-
-       *minor = xd_minor;
-
-       switch (xd_major) {
-       case IDE0_MAJOR: mi_idx = 0; new_major = IDE0_MAJOR; break;
-       case IDE1_MAJOR: mi_idx = 1; new_major = IDE1_MAJOR; break;
-       case IDE2_MAJOR: mi_idx = 2; new_major = IDE2_MAJOR; break;
-       case IDE3_MAJOR: mi_idx = 3; new_major = IDE3_MAJOR; break;
-       case IDE4_MAJOR: mi_idx = 4; new_major = IDE4_MAJOR; break;
-       case IDE5_MAJOR: mi_idx = 5; new_major = IDE5_MAJOR; break;
-       case IDE6_MAJOR: mi_idx = 6; new_major = IDE6_MAJOR; break;
-       case IDE7_MAJOR: mi_idx = 7; new_major = IDE7_MAJOR; break;
-       case IDE8_MAJOR: mi_idx = 8; new_major = IDE8_MAJOR; break;
-       case IDE9_MAJOR: mi_idx = 9; new_major = IDE9_MAJOR; break;
-       case SCSI_DISK0_MAJOR: mi_idx = 10; new_major = SCSI_DISK0_MAJOR; break;
-       case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
-               mi_idx = 11 + xd_major - SCSI_DISK1_MAJOR;
-               new_major = SCSI_DISK1_MAJOR + xd_major - SCSI_DISK1_MAJOR;
-               break;
-       case SCSI_CDROM_MAJOR: mi_idx = 18; new_major = SCSI_CDROM_MAJOR; break;
-       default: mi_idx = 19; new_major = 0;/* XXXcl notyet */ break;
-       }
-
-       if (major_info[mi_idx])
-               return major_info[mi_idx];
-
-       major_info[mi_idx] = kmalloc(sizeof(struct xlbd_major_info), GFP_KERNEL);
-       if (major_info[mi_idx] == NULL)
-               return NULL;
-
-       memset(major_info[mi_idx], 0, sizeof(struct xlbd_major_info));
-
-       switch (mi_idx) {
-       case 0 ... (NUM_IDE_MAJORS - 1):
-               major_info[mi_idx]->type = &xlbd_ide_type;
-               break;
-       case NUM_IDE_MAJORS ... (NUM_IDE_MAJORS + NUM_SCSI_MAJORS - 1):
-               major_info[mi_idx]->type = &xlbd_scsi_type;
-               break;
-       case (NUM_IDE_MAJORS + NUM_SCSI_MAJORS) ...
-               (NUM_IDE_MAJORS + NUM_SCSI_MAJORS + NUM_VBD_MAJORS - 1):
-               major_info[mi_idx]->type = &xlbd_vbd_type;
-               break;
-       }
-       major_info[mi_idx]->major = new_major;
-
-       if (register_blkdev(major_info[mi_idx]->major, major_info[mi_idx]->type->name)) {
-               printk(KERN_ALERT "XL VBD: can't get major %d with name %s\n",
-                   major_info[mi_idx]->major, major_info[mi_idx]->type->name);
-               goto out;
-       }
-
-       devfs_mk_dir(major_info[mi_idx]->type->name);
-
-       return major_info[mi_idx];
-
- out:
-       kfree(major_info[mi_idx]);
-       major_info[mi_idx] = NULL;
-       return NULL;
-}
-
-static struct gendisk *xlvbd_get_gendisk(struct xlbd_major_info *mi,
-                                        int xd_minor, vdisk_t *xd)
-{
-       struct gendisk *gd;
-       struct xlbd_disk_info *di;
-       int device, partno;
-
-       device = MKDEV(mi->major, xd_minor);
-       gd = get_gendisk(device, &partno);
-       if (gd)
-               return gd;
-
-       di = kmalloc(sizeof(struct xlbd_disk_info), GFP_KERNEL);
-       if (di == NULL)
-               return NULL;
-       di->mi = mi;
-       di->xd_device = xd->device;
-
-       /* Construct an appropriate gendisk structure. */
-       gd = alloc_disk(1);
-       if (gd == NULL)
-               goto out;
-
-       gd->major = mi->major;
-       gd->first_minor = xd_minor;
-       gd->fops = &xlvbd_block_fops;
-       gd->private_data = di;
-       sprintf(gd->disk_name, "%s%c%d", mi->type->name,
-           'a' + (xd_minor >> mi->type->partn_shift),
-           xd_minor & ((1 << mi->type->partn_shift) - 1));
-       /*  sprintf(gd->devfs_name, "%s%s/disc%d", mi->type->name, , ); XXXdevfs */
-
-       set_capacity(gd, xd->capacity);
-
-       if (xlbd_blk_queue == NULL) {
-               xlbd_blk_queue = blk_init_queue(do_blkif_request,
-                                               &blkif_io_lock);
-               if (xlbd_blk_queue == NULL)
-                       goto out;
-               elevator_init(xlbd_blk_queue, &elevator_noop);
-
-               /*
-                * Turn off barking 'headactive' mode. We dequeue
-                * buffer heads as soon as we pass them to back-end
-                * driver.
-                */
-               blk_queue_headactive(xlbd_blk_queue, 0); /* XXXcl: noop according to blkdev.h */
-
-               blk_queue_hardsect_size(xlbd_blk_queue,
-                                       mi->type->hardsect_size);
-               blk_queue_max_sectors(xlbd_blk_queue, mi->type->max_sectors); /* 'hwif->rqsize' if we knew it */
-
-               /* XXXcl: set mask to PAGE_SIZE for now, to improve either use 
-                  - blk_queue_merge_bvec to merge requests with adjacent ma's
-                  - the tags infrastructure
-                  - the dma infrastructure
-               */
-               blk_queue_segment_boundary(xlbd_blk_queue, PAGE_SIZE - 1);
-
-               blk_queue_max_phys_segments(xlbd_blk_queue,
-                    BLKIF_MAX_SEGMENTS_PER_REQUEST);
-               blk_queue_max_hw_segments(xlbd_blk_queue,
-                    BLKIF_MAX_SEGMENTS_PER_REQUEST); /* XXXcl not needed? */
-
-
-       }
-       gd->queue = xlbd_blk_queue;
-
-       add_disk(gd);
-
-       return gd;
-
- out:
-       if (gd)
-               del_gendisk(gd);
-       kfree(di);
-       return NULL;
-}
-
-/*
- * xlvbd_init_device - initialise a VBD device
- * @disk:              a vdisk_t describing the VBD
- *
- * Takes a vdisk_t * that describes a VBD the domain has access to.
- * Performs appropriate initialisation and registration of the device.
- *
- * Care needs to be taken when making re-entrant calls to ensure that
- * corruption does not occur.  Also, devices that are in use should not have
- * their details updated.  This is the caller's responsibility.
- */
-static int xlvbd_init_device(vdisk_t *xd)
-{
-       struct block_device *bd;
-       struct gendisk *gd;
-       struct xlbd_major_info *mi;
-       int device;
-       int minor;
-
-       int err = -ENOMEM;
-
-       mi = xlbd_get_major_info(xd->device, &minor);
-       if (mi == NULL)
-               return -EPERM;
-
-       device = MKDEV(mi->major, minor);
-
-       if ((bd = bdget(device)) == NULL)
-               return -EPERM;
-
-       /*
-        * Update of partition info, and check of usage count, is protected
-        * by the per-block-device semaphore.
-        */
-       down(&bd->bd_sem);
-
-       gd = xlvbd_get_gendisk(mi, minor, xd);
-       if (mi == NULL) {
-               err = -EPERM;
-               goto out;
-       }
-
-       if (VDISK_READONLY(xd->info))
-               set_disk_ro(gd, 1); 
-
-       /* Some final fix-ups depending on the device type */
-       switch (VDISK_TYPE(xd->info)) { 
-       case VDISK_TYPE_CDROM:
-               gd->flags |= GENHD_FL_REMOVABLE | GENHD_FL_CD; 
-               /* FALLTHROUGH */
-       case VDISK_TYPE_FLOPPY: 
-       case VDISK_TYPE_TAPE:
-               gd->flags |= GENHD_FL_REMOVABLE; 
-               break; 
-
-       case VDISK_TYPE_DISK:
-               break; 
-
-       default:
-               printk(KERN_ALERT "XenLinux: unknown device type %d\n", 
-                   VDISK_TYPE(xd->info)); 
-               break; 
-       }
-
-       err = 0;
- out:
-       up(&bd->bd_sem);
-       bdput(bd);    
-       return err;
-}
-
#if 0	/* Dead code: 2.4-era gendisk bookkeeping, disabled in the 2.6 port. */
/*
 * xlvbd_remove_device - remove a device node if possible
 * @device:       numeric device ID
 *
 * Updates the gendisk structure and invalidates devices.
 *
 * Returns 0 on success, -1 if the block device cannot be looked up or the
 * VBD is still in use.
 *
 * This is OK for now but in future, should perhaps consider where this should
 * deallocate gendisks / unregister devices.
 */
static int xlvbd_remove_device(int device)
{
    int i, rc = 0, minor = MINOR(device);
    struct gendisk *gd;
    struct block_device *bd;
    xen_block_t *disk = NULL;

    if ( (bd = bdget(device)) == NULL )
        return -1;

    /*
     * Update of partition info, and check of usage count, is protected
     * by the per-block-device semaphore.
     */
    down(&bd->bd_sem);

    /* Both lookups must succeed for a device we were asked to remove. */
    if ( ((gd = get_gendisk(device)) == NULL) ||
         ((disk = xldev_to_xldisk(device)) == NULL) )
        BUG();

    /* Refuse to remove a VBD that is still open somewhere. */
    if ( disk->usage != 0 )
    {
        printk(KERN_ALERT "VBD removal failed - in use [dev=%x]\n", device);
        rc = -1;
        goto out;
    }
    if ( (minor & (gd->max_p-1)) != 0 )
    {
        /* 1: The VBD is mapped to a partition rather than a whole unit. */
        invalidate_device(device, 1);
	gd->part[minor].start_sect = 0;
        gd->part[minor].nr_sects   = 0;
        gd->sizes[minor]           = 0;

        /* Clear the consists-of-virtual-partitions flag if possible. */
        gd->flags[minor >> gd->minor_shift] &= ~GENHD_FL_VIRT_PARTNS;
        for ( i = 1; i < gd->max_p; i++ )
            if ( gd->sizes[(minor & ~(gd->max_p-1)) + i] != 0 )
                gd->flags[minor >> gd->minor_shift] |= GENHD_FL_VIRT_PARTNS;

        /*
         * If all virtual partitions are now gone, and a 'whole unit' VBD is
         * present, then we can try to grok the unit's real partition table.
         */
        if ( !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) &&
             (gd->sizes[minor & ~(gd->max_p-1)] != 0) &&
             !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE) )
        {
            /* Re-read the real partition table for the whole unit. */
            register_disk(gd,
                          device&~(gd->max_p-1), 
                          gd->max_p, 
                          &xlvbd_block_fops,
                          gd->part[minor&~(gd->max_p-1)].nr_sects);
        }
    }
    else
    {
        /*
         * 2: The VBD is mapped to an entire 'unit'. Clear all partitions.
         * NB. The partition entries are only cleared if there are no VBDs
         * mapped to individual partitions on this unit.
         */
        i = gd->max_p - 1; /* Default: clear subpartitions as well. */
        if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS )
            i = 0; /* 'Virtual' mode: only clear the 'whole unit' entry. */
        while ( i >= 0 )
        {
            invalidate_device(device+i, 1);
            gd->part[minor+i].start_sect = 0;
            gd->part[minor+i].nr_sects   = 0;
            gd->sizes[minor+i]           = 0;
            i--;
        }
    }

 out:
    up(&bd->bd_sem);
    bdput(bd);
    return rc;
}
-
-/*
- * xlvbd_update_vbds - reprobes the VBD status and performs updates driver
- * state. The VBDs need to be updated in this way when the domain is
- * initialised and also each time we receive an XLBLK_UPDATE event.
- */
-void xlvbd_update_vbds(void)
-{
-    int i, j, k, old_nr, new_nr;
-    vdisk_t *old_info, *new_info, *merged_info;
-
-    old_info = vbd_info;
-    old_nr   = nr_vbds;
-
-    new_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL);
-    if ( unlikely(new_nr = xlvbd_get_vbd_info(new_info)) < 0 )
-    {
-        kfree(new_info);
-        return;
-    }
-
-    /*
-     * Final list maximum size is old list + new list. This occurs only when
-     * old list and new list do not overlap at all, and we cannot yet destroy
-     * VBDs in the old list because the usage counts are busy.
-     */
-    merged_info = kmalloc((old_nr + new_nr) * sizeof(vdisk_t), GFP_KERNEL);
-
-    /* @i tracks old list; @j tracks new list; @k tracks merged list. */
-    i = j = k = 0;
-
-    while ( (i < old_nr) && (j < new_nr) )
-    {
-        if ( old_info[i].device < new_info[j].device )
-        {
-            if ( xlvbd_remove_device(old_info[i].device) != 0 )
-                memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t));
-            i++;
-        }
-        else if ( old_info[i].device > new_info[j].device )
-        {
-            if ( xlvbd_init_device(&new_info[j]) == 0 )
-                memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t));
-            j++;
-        }
-        else
-        {
-            if ( ((old_info[i].capacity == new_info[j].capacity) &&
-                  (old_info[i].info == new_info[j].info)) ||
-                 (xlvbd_remove_device(old_info[i].device) != 0) )
-                memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t));
-            else if ( xlvbd_init_device(&new_info[j]) == 0 )
-                memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t));
-            i++; j++;
-        }
-    }
-
-    for ( ; i < old_nr; i++ )
-    {
-        if ( xlvbd_remove_device(old_info[i].device) != 0 )
-            memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t));
-    }
-
-    for ( ; j < new_nr; j++ )
-    {
-        if ( xlvbd_init_device(&new_info[j]) == 0 )
-            memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t));
-    }
-
-    vbd_info = merged_info;
-    nr_vbds  = k;
-
-    kfree(old_info);
-    kfree(new_info);
-}
-#endif
-
-/*
- * Set up all the linux device goop for the virtual block devices
- * (vbd's) that we know about. Note that although from the backend
- * driver's p.o.v. VBDs are addressed simply an opaque 16-bit device
- * number, the domain creation tools conventionally allocate these
- * numbers to correspond to those used by 'real' linux -- this is just
- * for convenience as it means e.g. that the same /etc/fstab can be
- * used when booting with or without Xen.
- */
-int xlvbd_init(void)
-{
-       int i;
-
-       /*
-        * If compiled as a module, we don't support unloading yet. We
-        * therefore permanently increment the reference count to
-        * disallow it.
-        */
-       MOD_INC_USE_COUNT;
-
-       memset(major_info, 0, sizeof(major_info));
-
-       for (i = 0; i < sizeof(major_info) / sizeof(major_info[0]); i++) {
-       }
-
-       vbd_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL);
-       nr_vbds  = xlvbd_get_vbd_info(vbd_info);
-
-       if (nr_vbds < 0) {
-               kfree(vbd_info);
-               vbd_info = NULL;
-               nr_vbds  = 0;
-       } else {
-               for (i = 0; i < nr_vbds; i++)
-                       xlvbd_init_device(&vbd_info[i]);
-       }
-
-       return 0;
-}